def add_file(prediction, create, value, *args, **kwargs):
    train_featureset = prediction.model.featureset
    fset_data, data = featurize.load_featureset(train_featureset.file_uri)
    if 'class' in prediction.dataset.name or 'regr' in prediction.dataset.name:
        labels = data['labels']
    else:
        labels = []
    model_data = joblib.load(prediction.model.file_uri)
    if hasattr(model_data, 'best_estimator_'):
        model_data = model_data.best_estimator_
    preds = model_data.predict(fset_data)
    pred_probs = (pd.DataFrame(model_data.predict_proba(fset_data),
                               index=fset_data.index.astype(str),
                               columns=model_data.classes_)
                  if hasattr(model_data, 'predict_proba') else [])
    all_classes = model_data.classes_ if hasattr(model_data, 'classes_') else []
    pred_path = pjoin(TMP_DIR, '{}.npz'.format(str(uuid.uuid4())))
    featurize.save_featureset(fset_data, pred_path, labels=labels,
                              preds=preds, pred_probs=pred_probs)
    prediction.file_uri = pred_path
    DBSession().commit()


def compute_or_read_features():
    featurefile = f'{datadir}/plasticc_featuretable.npz'

    def worker(tsobj):
        # NB: a locally defined worker relies on fork-based multiprocessing;
        # it cannot be pickled under the 'spawn' start method
        global features_to_use
        thisfeats = featurize.featurize_single_ts(
            tsobj, features_to_use=features_to_use, raise_exceptions=False)
        return thisfeats

    if os.path.exists(featurefile):
        featuretable, _ = featurize.load_featureset(featurefile)
    else:
        light_curves = read_lightcurves(datadir)
        features_list = []
        with tqdm(total=nobjects, desc="Computing Features") as pbar:
            with multiprocessing.Pool() as pool:
                results = pool.imap(worker, list(light_curves.values()))
                for res in results:
                    features_list.append(res)
                    pbar.update()
        featuretable = featurize.assemble_featureset(
            features_list=features_list, time_series=light_curves.values())
        # Save the computed feature set to a file
        featurize.save_featureset(fset=featuretable, path=featurefile)

    # Convert cesium's MultiIndex columns to an AstroPy Table, which is
    # better supported by scikit-learn
    old_names = featuretable.columns.values
    new_names = ['{}_{}'.format(x, pbmap.get(y, 'meta')) for x, y in old_names]
    cols = [featuretable[col] for col in old_names]
    allfeats = Table(cols, names=new_names)
    del featuretable
    return allfeats


def add_file(model, create, value, *args, **kwargs):
    model_params = {
        "RandomForestClassifier": {
            "bootstrap": True, "criterion": "gini", "oob_score": False,
            "max_features": "auto", "n_estimators": 10, "random_state": 0},
        "RandomForestRegressor": {
            "bootstrap": True, "criterion": "mse", "oob_score": False,
            "max_features": "auto", "n_estimators": 10},
        "LinearSGDClassifier": {
            "loss": "hinge"},
        "LinearRegressor": {
            "fit_intercept": True}}
    fset_data, data = featurize.load_featureset(model.featureset.file_uri)
    model_data = MODELS_TYPE_DICT[model.type](**model_params[model.type])
    model_data.fit(fset_data, data['labels'])
    model.file_uri = pjoin('/tmp/', '{}.pkl'.format(str(uuid.uuid4())))
    joblib.dump(model_data, model.file_uri)
    DBSession().commit()


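# `MODELS_TYPE_DICT` is referenced above but defined elsewhere in the module.
# A minimal sketch of what it could look like, assuming the four model types
# above map onto their scikit-learn counterparts ('LinearSGDClassifier' and
# 'LinearRegressor' are this codebase's aliases, not sklearn class names):
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.linear_model import SGDClassifier, LinearRegression

MODELS_TYPE_DICT = {
    'RandomForestClassifier': RandomForestClassifier,
    'RandomForestRegressor': RandomForestRegressor,
    'LinearSGDClassifier': SGDClassifier,
    'LinearRegressor': LinearRegression,
}

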
def get(self, prediction_id=None, action=None):
    if action == 'download':
        pred_path = Prediction.get_if_owned_by(prediction_id,
                                               self.current_user).file_uri
        fset, data = featurize.load_featureset(pred_path)
        result = pd.DataFrame(({'label': data['labels']}
                               if len(data['labels']) > 0 else None),
                              index=fset.index)
        if len(data.get('pred_probs', [])) > 0:
            result = pd.concat((result, data['pred_probs']), axis=1)
        else:
            result['prediction'] = data['preds']
        result.index.name = 'ts_name'
        self.set_header("Content-Type", 'text/csv; charset="utf-8"')
        self.set_header("Content-Disposition", "attachment; "
                        "filename=cesium_prediction_results.csv")
        self.write(result.to_csv(index=True))
    else:
        if prediction_id is None:
            predictions = [prediction
                           for project in self.current_user.projects
                           for prediction in project.predictions]
            prediction_info = [p.display_info() for p in predictions]
        else:
            prediction = Prediction.get_if_owned_by(prediction_id,
                                                    self.current_user)
            prediction_info = prediction.display_info()
        return self.success(prediction_info)


def test_roundtrip_featureset(tmpdir):
    fset_path = os.path.join(str(tmpdir), 'test.npz')
    for n_channels in [1, 3]:
        for labels in [['class1', 'class2'], []]:
            fset, labels = sample_featureset(3, n_channels, ['amplitude'],
                                             labels, names=['a', 'b', 'c'],
                                             meta_features=['meta1'])
            pred_probs = pd.DataFrame(np.random.random((len(fset), 2)),
                                      index=fset.index.values,
                                      columns=['class1', 'class2'])
            featurize.save_featureset(fset, fset_path, labels=labels,
                                      pred_probs=pred_probs)
            fset_loaded, data_loaded = featurize.load_featureset(fset_path)
            npt.assert_allclose(fset.values, fset_loaded.values)
            npt.assert_array_equal(fset.index, fset_loaded.index)
            npt.assert_array_equal(fset.columns, fset_loaded.columns)
            assert isinstance(fset_loaded, pd.DataFrame)
            npt.assert_array_equal(labels, data_loaded['labels'])
            npt.assert_allclose(pred_probs, data_loaded['pred_probs'])
            npt.assert_array_equal(pred_probs.columns,
                                   data_loaded['pred_probs'].columns)


def get(self, prediction_id=None, action=None):
    if action == 'download':
        prediction = Prediction.get_if_owned_by(prediction_id,
                                                self.current_user)
        pred_path = prediction.file_uri
        fset, data = featurize.load_featureset(pred_path)
        result = pd.DataFrame(({'label': data['labels']}
                               if len(data['labels']) > 0 else None),
                              index=fset.index)
        if len(data.get('pred_probs', [])) > 0:
            result = pd.concat((result, data['pred_probs']), axis=1)
        else:
            result['prediction'] = data['preds']
        result.index.name = 'ts_name'
        self.set_header("Content-Type", 'text/csv; charset="utf-8"')
        self.set_header(
            "Content-Disposition", "attachment; "
            f"filename=cesium_prediction_results_{prediction.project.name}"
            f"_{prediction.dataset.name}"
            f"_{prediction.model.name}_{prediction.finished}.csv")
        self.write(result.to_csv(index=True))
    else:
        if prediction_id is None:
            predictions = [prediction
                           for project in self.current_user.projects
                           for prediction in project.predictions]
            prediction_info = [p.display_info() for p in predictions]
        else:
            prediction = Prediction.get_if_owned_by(prediction_id,
                                                    self.current_user)
            prediction_info = prediction.display_info()
        return self.success(prediction_info)


def display_info(self):
    info = self.to_dict()
    info['model_type'] = self.model.type
    info['dataset_name'] = self.dataset.name
    info['model_name'] = self.model.name
    info['featureset_name'] = self.model.featureset.name
    if self.task_id is None:
        fset, data = featurize.load_featureset(self.file_uri)
        info['isProbabilistic'] = (len(data['pred_probs']) > 0)
        info['results'] = Prediction.format_pred_data(fset, data)
    return info


def _build_model_compute_statistics(fset_path, model_type, model_params,
                                    params_to_optimize, model_path):
    '''Build model and return summary statistics.

    Parameters
    ----------
    fset_path : str
        Path to feature set .npz file.
    model_type : str
        Type of model to be built, e.g. 'RandomForestClassifier'.
    model_params : dict
        Dictionary with hyperparameter values to be used in model building.
        Keys are parameter names, values are the associated parameter values.
        These hyperparameters will be passed to the model constructor as-is
        (for hyperparameter optimization, see `params_to_optimize`).
    params_to_optimize : dict or list of dict
        During hyperparameter optimization, various model parameters
        are adjusted to give an optimal fit. This dictionary gives the
        different values that should be explored for each parameter. E.g.,
        `{'alpha': [1, 2], 'beta': [4, 5, 6]}` would fit models on all
        6 combinations of alpha and beta and compare the resulting models'
        goodness-of-fit. If None, only those hyperparameters specified in
        `model_params` will be used (passed to model constructor as-is).
    model_path : str
        Path indicating where serialized model will be saved.

    Returns
    -------
    score : float
        The model's training score.
    best_params : dict
        Dictionary of best hyperparameter values (keys are parameter names,
        values are the corresponding best values) determined by
        `scikit-learn`'s `GridSearchCV`. If no hyperparameter optimization
        is performed (i.e. `params_to_optimize` is None or an empty dict),
        this will be an empty dict.
    '''
    fset, data = featurize.load_featureset(fset_path)
    if len(data['labels']) != len(fset):
        raise ValueError("Cannot build model for unlabeled feature set.")

    # `n_jobs` applies to the grid search itself, so pull it out of the
    # constructor parameters when optimization is requested
    n_jobs = (model_params.pop('n_jobs')
              if 'n_jobs' in model_params and params_to_optimize else -1)
    model = MODELS_TYPE_DICT[model_type](**model_params)
    if params_to_optimize:
        model = GridSearchCV(model, params_to_optimize, n_jobs=n_jobs)
    model.fit(fset, data['labels'])
    score = model.score(fset, data['labels'])
    best_params = model.best_params_ if params_to_optimize else {}
    joblib.dump(model, model_path)

    return score, best_params


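# A minimal usage sketch (hypothetical paths and parameter grid; assumes a
# labeled feature set previously saved with `featurize.save_featureset`):
score, best_params = _build_model_compute_statistics(
    fset_path='/tmp/example_featureset.npz',
    model_type='RandomForestClassifier',
    model_params={'n_estimators': 10, 'random_state': 0},
    params_to_optimize={'max_depth': [2, 4, 8]},
    model_path='/tmp/example_model.pkl')

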
def feature_scatterplot(fset_path, features_to_plot):
    """Create scatter plot of feature set.

    Parameters
    ----------
    fset_path : str
        Path to feature set to be plotted.
    features_to_plot : list of str
        List of feature names to be plotted.

    Returns
    -------
    (str, str)
        Returns (docs_json, render_items) json for the desired plot.
    """
    fset, data = featurize.load_featureset(fset_path)
    fset = fset[features_to_plot]
    colors = cycle(palette[5])
    plots = np.array([[figure(width=300, height=200)
                       for j in range(len(features_to_plot))]
                      for i in range(len(features_to_plot))])

    for (j, i), p in np.ndenumerate(plots):
        if (j == i == 0):
            p.title.text = "Scatterplot matrix"
        p.circle(fset.values[:, i], fset.values[:, j], color=next(colors))
        p.xaxis.minor_tick_line_color = None
        p.yaxis.minor_tick_line_color = None
        p.ygrid[0].ticker.desired_num_ticks = 2
        p.xgrid[0].ticker.desired_num_ticks = 4
        p.outline_line_color = None
        p.axis.visible = None

    # Note: this layout and embed code targets an older bokeh API
    plot = gridplot(plots.tolist(), ncol=len(features_to_plot),
                    mergetools=True, responsive=True, title="Test")

    # Convert plot to json objects necessary for rendering with bokeh on the
    # frontend
    render_items = [{'docid': plot._id, 'elementid': make_id()}]
    doc = Document()
    doc.add_root(plot)
    docs_json_inner = doc.to_json()
    docs_json = {render_items[0]['docid']: docs_json_inner}
    docs_json = serialize_json(docs_json)
    render_items = serialize_json(render_items)

    return docs_json, render_items


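# Hypothetical usage sketch: serialize a scatterplot matrix for two features
# (the path and feature names below are illustrative, not taken from the
# functions above):
docs_json, render_items = feature_scatterplot(
    '/tmp/example_featureset.npz', ['amplitude', 'median'])

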
def get(self, featureset_id=None, action=None):
    if action == 'download':
        featureset = Featureset.get_if_owned_by(featureset_id,
                                                self.current_user)
        fset_path = featureset.file_uri
        fset, data = featurize.load_featureset(fset_path)
        if 'labels' in data:
            fset['labels'] = data['labels']
        self.set_header("Content-Type", 'text/csv; charset="utf-8"')
        self.set_header(
            "Content-Disposition", "attachment; "
            f"filename=cesium_featureset_{featureset.project.name}"
            f"_{featureset.name}_{featureset.finished}.csv")
        self.write(fset.to_csv(index=True))
    else:
        if featureset_id is not None:
            featureset_info = Featureset.get_if_owned_by(featureset_id,
                                                         self.current_user)
        else:
            featureset_info = [f for p in self.current_user.projects
                               for f in p.featuresets]
            featureset_info.sort(key=lambda f: f.created_at, reverse=True)
        self.success(featureset_info)


def featurize_data(pbmap, pbnames, lcdata, metadata, nobjects, featurefile):
    """Feature extractor for PLAsTiCC.

    Extracts features from the data with cesium library functions: builds a
    time-series dictionary, then extracts features for each time series.
    The features used are described in the file `feature_sets`.

    Created on Mon Apr 29 19:30:52 2019
    @author: luisarribas
    """
    print("")
    print("EXTRACTING FEATURES")
    print("===================")
    print("")
    print("Building Timeseries....wait")
    print("===========================")

    # ********************** BUILD TIME SERIES *****************************
    tsdict = OrderedDict()
    for i in range(nobjects):
        row = metadata[i]
        thisid = row['object_id']
        target = row['target']
        meta = {'zBand': row['zBand'],
                'z': row['hostgal_photoz'],
                'zerr': row['hostgal_photoz_err'],
                'mag': row['magnitude'],
                'u-b': row['u-b'],
                'b-v': row['b-v']}
        ind = (lcdata['object_id'] == thisid)
        thislc = lcdata[ind]
        pbind = [(thislc['passband'] == pb) for pb in pbmap]
        t = [thislc['mjd'][mask].data for mask in pbind]
        m = [thislc['flux'][mask].data for mask in pbind]
        e = [thislc['flux_err'][mask].data for mask in pbind]
        tsdict[thisid] = TimeSeries(t=t, m=m, e=e, label=target, name=thisid,
                                    meta_features=meta,
                                    channel_names=pbnames)
    print("")
    print("OK!")
    print(" ")

    # ***************** FEATURE EXTRACTION WITH CESIUM *********************
    warnings.simplefilter('ignore')
    if os.path.exists(featurefile):
        print("")
        print("Loading features from file....wait")
        print("==================================")
        featuretable, _ = featurize.load_featureset(featurefile)
        print("")
        print("OK!")
        print(" ")
    else:
        features_list = []
        print("")
        print("Computing features....wait")
        print("==========================")
        with schwimmbad.MultiPool() as pool:
            results = pool.imap(worker, list(tsdict.values()))
            for res in results:
                features_list.append(res)
        featuretable = featurize.assemble_featureset(
            features_list=features_list, time_series=tsdict.values())
        featurize.impute_featureset(fset=featuretable, strategy='constant',
                                    value=0, max_value=18446744073709551000,
                                    inplace=True)
        featurize.save_featureset(fset=featuretable, path=featurefile)
        print("")
        print("OK!")
        print(" ")

    # ***************** Build pandas DataFrame output **********************
    old_names = featuretable.columns.values
    new_names = ['{}_{}'.format(x, pbmap.get(y, 'meta')) for x, y in old_names]
    cols = [featuretable[col] for col in old_names]
    allfeats = Table(cols, names=new_names, masked=False)
    allfeats['target'] = metadata['target']
    # Round-trip through pandas/numpy to replace any remaining NaNs with zeros
    allfeats = allfeats.to_pandas()
    allfeats = np.nan_to_num(allfeats)
    new_names.append('target')
    allfeats = Table(allfeats, names=new_names, masked=False)
    allfeats = allfeats.to_pandas()
    print("")
    print("Extracted features = ", len(allfeats.columns))
    print("==========================")
    print("")
    print("Nan Values detected = ", sum(len(allfeats) - allfeats.count()))
    print("==========================")
    return allfeats
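

# `worker` above is defined elsewhere in the module; a minimal sketch,
# mirroring the per-object featurization in `compute_or_read_features`
# (the `features_to_use` list is assumed to be a module-level global):
def worker(tsobj):
    return featurize.featurize_single_ts(
        tsobj, features_to_use=features_to_use, raise_exceptions=False)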