Esempio n. 1
0
def _featurize_worker(tsobj):
    """Compute the configured cesium features for a single time series.

    This lives at module level (instead of being nested inside
    ``compute_or_read_features``) because ``multiprocessing.Pool`` pickles the
    callable it dispatches to worker processes, and functions defined inside
    another function cannot be pickled.

    Reads the module-level ``features_to_use`` setting.
    """
    return featurize.featurize_single_ts(
        tsobj, features_to_use=features_to_use, raise_exceptions=False)


def compute_or_read_features():
    """Return the feature table, loading it from disk or computing it.

    If ``{datadir}/plasticc_featuretable.npz`` exists it is loaded with
    ``featurize.load_featureset``.  Otherwise the light curves are read with
    ``read_lightcurves(datadir)``, featurized in parallel with a
    ``multiprocessing.Pool``, assembled into a feature set, and saved to that
    file so later calls can skip the computation.

    Relies on the module-level names ``datadir``, ``features_to_use`` and
    ``pbmap``.

    Returns:
        An ``astropy`` ``Table`` with one column per (feature, channel) pair,
        named ``"<feature>_<pbmap[channel]>"``; channels missing from
        ``pbmap`` get the suffix ``"meta"``.
    """
    featurefile = f'{datadir}/plasticc_featuretable.npz'

    if os.path.exists(featurefile):
        featuretable, _ = featurize.load_featureset(featurefile)
    else:
        light_curves = read_lightcurves(datadir)
        series = list(light_curves.values())
        features_list = []
        # Size the progress bar from the work actually submitted, so it is
        # always consistent with the number of results that come back.
        with tqdm(total=len(series), desc="Computing Features") as pbar:
            with multiprocessing.Pool() as pool:
                # imap yields results in submission order, keeping
                # features_list aligned with light_curves.values().
                for res in pool.imap(_featurize_worker, series):
                    features_list.append(res)
                    pbar.update()

        featuretable = featurize.assemble_featureset(
            features_list=features_list, time_series=light_curves.values())
        # Save the computed feature set to a file
        featurize.save_featureset(fset=featuretable, path=featurefile)

    # convert cesium's MultiIndex to an AstroPy Table that is better supported by
    # scikit-learn
    old_names = featuretable.columns.values
    new_names = ['{}_{}'.format(x, pbmap.get(y, 'meta')) for x, y in old_names]
    cols = [featuretable[col] for col in old_names]
    return Table(cols, names=new_names)
Esempio n. 2
0
 def add_file(prediction, create, value, *args, **kwargs):
     """Generate a prediction file for ``prediction`` and record its path.

     Loads the feature set the prediction's model was trained on, runs the
     stored model over it, saves features, labels, predictions and (when the
     model offers ``predict_proba``) class probabilities to a fresh ``.npz``
     file under ``TMP_DIR``, then stores that path in ``prediction.file_uri``
     and commits the database session.

     ``create``, ``value`` and any extra arguments are accepted but not used.
     """
     fset_data, data = featurize.load_featureset(
         prediction.model.featureset.file_uri)

     # Labels are taken from the saved data only when the dataset name
     # contains 'class' or 'regr'; otherwise an empty list is stored.
     dataset_name = prediction.dataset.name
     labels = []
     if 'class' in dataset_name or 'regr' in dataset_name:
         labels = data['labels']

     # A fitted search object exposes the actual model as `best_estimator_`;
     # use that when present, the loaded object itself otherwise.
     estimator = joblib.load(prediction.model.file_uri)
     estimator = getattr(estimator, 'best_estimator_', estimator)

     preds = estimator.predict(fset_data)

     if hasattr(estimator, 'predict_proba'):
         pred_probs = pd.DataFrame(estimator.predict_proba(fset_data),
                                   index=fset_data.index.astype(str),
                                   columns=estimator.classes_)
     else:
         pred_probs = []

     # Computed as in the original code; not used further in this function.
     all_classes = getattr(estimator, 'classes_', [])

     file_name = '{}.npz'.format(str(uuid.uuid4()))
     pred_path = pjoin(TMP_DIR, file_name)
     featurize.save_featureset(fset_data, pred_path, labels=labels,
                               preds=preds, pred_probs=pred_probs)

     prediction.file_uri = pred_path
     DBSession().commit()
Esempio n. 3
0
def test_roundtrip_featureset(tmpdir):
    """Check that a feature set survives a save/load round trip.

    For single- and three-channel data, with and without class labels, the
    feature set is saved together with its labels and a prediction
    probability frame, loaded back, and compared against what was written.
    """
    fset_path = os.path.join(str(tmpdir), 'test.npz')

    for n_channels in (1, 3):
        for requested_labels in [['class1', 'class2'], []]:
            fset, labels = sample_featureset(
                3, n_channels, ['amplitude'], requested_labels,
                names=['a', 'b', 'c'], meta_features=['meta1'])

            random_probs = np.random.random((len(fset), 2))
            pred_probs = pd.DataFrame(random_probs,
                                      index=fset.index.values,
                                      columns=['class1', 'class2'])

            featurize.save_featureset(
                fset, fset_path, labels=labels, pred_probs=pred_probs)
            fset_loaded, data_loaded = featurize.load_featureset(fset_path)

            # The feature table itself: values, index and columns.
            npt.assert_allclose(fset.values, fset_loaded.values)
            npt.assert_array_equal(fset.index, fset_loaded.index)
            npt.assert_array_equal(fset.columns, fset_loaded.columns)
            assert isinstance(fset_loaded, pd.DataFrame)

            # The extra arrays stored alongside it.
            npt.assert_array_equal(labels, data_loaded['labels'])
            loaded_probs = data_loaded['pred_probs']
            npt.assert_allclose(pred_probs, loaded_probs)
            npt.assert_array_equal(pred_probs.columns, loaded_probs.columns)
Esempio n. 4
0
    def add_file(featureset, create, value, *args, **kwargs):
        """Attach a freshly generated sample feature-set file to ``featureset``.

        Does nothing unless ``create`` is truthy.  Otherwise a sample feature
        set is built with ``sample_featureset`` for the features named in
        ``featureset.features_list``, written to a new ``.npz`` file under
        ``TMP_DIR``, and its path is stored in ``featureset.file_uri`` before
        the database session is committed.

        The labels passed to ``sample_featureset`` depend on the feature set's
        name: class names if it contains 'class', numeric targets if it
        contains 'regr', and an empty list otherwise.
        """
        if not create:
            return

        label_choices = (
            ('class', ['Mira', 'Classical_Cepheid']),
            ('regr', [2.2, 3.4, 4.4, 2.2, 3.1]),
        )
        labels = []
        for keyword, candidate in label_choices:
            # First matching keyword wins, mirroring an if/elif chain.
            if keyword in featureset.name:
                labels = candidate
                break

        fset_data, fset_labels = sample_featureset(
            5, 1, featureset.features_list, labels)

        file_name = '{}.npz'.format(str(uuid.uuid4()))
        fset_path = pjoin(TMP_DIR, file_name)
        featurize.save_featureset(fset_data, fset_path, labels=fset_labels)

        featureset.file_uri = fset_path
        DBSession().commit()
Esempio n. 5
0
    def post(self):
        """Create a Featureset from an uploaded CSV of precomputed features.

        The request JSON supplies ``datasetID`` (``None`` or the string
        ``'None'`` means no dataset), ``projectID``, ``featuresetName`` and
        ``dataFile`` with ``body`` (the CSV text) and ``name`` (the uploaded
        file name).

        The CSV is parsed with the first column as the index and the first
        two rows as a two-level column header.  A top-level ``labels`` column,
        if present, is split off and saved as the labels; otherwise
        ``[None]`` is saved.  The feature set is written to a uniquely named
        ``.npz`` file in the configured features folder, a ``Featureset`` row
        is added and committed, and the new record is returned through
        ``self.success``.
        """
        # Function-scope import: only part of the file is visible here, so the
        # dependency is brought in locally rather than assumed at module level.
        import os

        data = self.get_json()
        if data['datasetID'] not in [None, 'None']:
            dataset = Dataset.query.filter(Dataset.id == data['datasetID']).one()
        else:
            dataset = None
        current_project = Project.get_if_owned_by(data['projectID'],
                                                  self.current_user)
        feature_data = StringIO(data['dataFile']['body'])
        fset = pd.read_csv(feature_data, index_col=0, header=[0, 1])
        if 'labels' in fset:
            labels = fset.pop('labels').values.ravel()
            if labels.dtype == 'O':
                labels = [str(label) for label in labels]
        else:
            labels = [None]

        # The uploaded file name comes from the client and must not be able
        # to steer where the file is written: keep only its final path
        # component (treating both '/' and '\\' as separators) so the result
        # always lands directly inside the features folder.
        upload_name = os.path.basename(
            str(data['dataFile']['name']).replace('\\', '/'))
        fset_path = pjoin(
            self.cfg['paths:features_folder'],
            '{}_{}.npz'.format(uuid.uuid4(), upload_name))

        featurize.save_featureset(fset, fset_path, labels=labels)

        # Meta-features will have channel values of an empty string or a string
        # beginning with 'Unnamed:'
        features_list = [el[0] for el in fset.columns.tolist() if
                         (el[1] != '' and not el[1].startswith('Unnamed:'))]

        featureset = Featureset(name=data['featuresetName'],
                                file_uri=fset_path,
                                project=current_project,
                                dataset=dataset,
                                features_list=features_list,
                                finished=datetime.datetime.now(),
                                custom_features_script=None)
        DBSession().add(featureset)
        DBSession().commit()

        self.success(featureset, 'cesium/FETCH_FEATURESETS')
Esempio n. 6
0
def test_roundtrip_featureset(tmpdir):
    """Round-trip a feature set through ``save_featureset``/``load_featureset``.

    Covers one and three channels, each with and without labels, and checks
    that the loaded feature table, labels and prediction probabilities match
    what was saved.
    """
    fset_path = os.path.join(str(tmpdir), 'test.npz')

    # Every (channel count, requested labels) combination, in the same order
    # a pair of nested loops would visit them.
    cases = [(n_channels, wanted)
             for n_channels in [1, 3]
             for wanted in [['class1', 'class2'], []]]

    for n_channels, wanted in cases:
        fset, labels = sample_featureset(3, n_channels, ['amplitude'], wanted,
                                         names=['a', 'b', 'c'],
                                         meta_features=['meta1'])

        prob_columns = ['class1', 'class2']
        pred_probs = pd.DataFrame(np.random.random((len(fset), 2)),
                                  index=fset.index.values,
                                  columns=prob_columns)

        featurize.save_featureset(fset, fset_path,
                                  labels=labels, pred_probs=pred_probs)
        fset_loaded, data_loaded = featurize.load_featureset(fset_path)

        npt.assert_allclose(fset.values, fset_loaded.values)
        npt.assert_array_equal(fset.index, fset_loaded.index)
        npt.assert_array_equal(fset.columns, fset_loaded.columns)
        assert isinstance(fset_loaded, pd.DataFrame)
        npt.assert_array_equal(labels, data_loaded['labels'])
        npt.assert_allclose(pred_probs, data_loaded['pred_probs'])
        npt.assert_array_equal(pred_probs.columns,
                               data_loaded['pred_probs'].columns)
Esempio n. 7
0
def featurize_data(pbmap, pbnames, lcdata, metadata, nobjects, featurefile):
    """
    Feature extractor for PLaSTiCC, built on the cesium library.

    Builds one multi-channel ``TimeSeries`` per object from the light-curve
    table, then either loads a previously saved feature set from
    ``featurefile`` or computes the features in a process pool, imputes
    missing values with 0 and saves the result to ``featurefile``.  The
    feature table is finally flattened into a pandas DataFrame with columns
    named ``"<feature>_<pbmap[channel]>"`` (``"meta"`` for channels not in
    ``pbmap``) plus a ``target`` column, with NaNs replaced via
    ``np.nan_to_num``.

    Progress messages are printed along the way, and warnings are silenced
    with ``warnings.simplefilter('ignore')`` before the feature step.

    Parameters:
        pbmap: mapping whose keys are passband identifiers and whose values
            are the suffixes used in the output column names.
        pbnames: channel names handed to each ``TimeSeries``.
        lcdata: light-curve table with ``object_id``, ``passband``, ``mjd``,
            ``flux`` and ``flux_err`` columns.
        metadata: per-object table, indexable by row number, providing
            ``object_id``, ``target`` and the meta-feature columns.
        nobjects: number of leading ``metadata`` rows to process.
        featurefile: path of the cached feature-set file.

    Returns:
        pandas DataFrame of the extracted features and the target.

    Originally created on Mon Apr 29 19:30:52 2019 by luisarribas.
    """
    def _emit(*lines):
        # Print each line separately, exactly as consecutive print() calls.
        for line in lines:
            print(line)

    _emit("",
          "EXTRACTING FEATURES",
          "===================",
          "",
          "Building Timeseries....wait",
          "===========================")

    # ---------------------- Build the time series -------------------------
    tsdict = OrderedDict()
    for idx in range(nobjects):
        row = metadata[idx]
        object_id = row['object_id']
        target = row['target']

        meta = {
            'zBand': row['zBand'],
            'z': row['hostgal_photoz'],
            'zerr': row['hostgal_photoz_err'],
            'mag': row['magnitude'],
            'u-b': row['u-b'],
            'b-v': row['b-v'],
        }

        # Rows of the light-curve table belonging to this object.
        object_lc = lcdata[lcdata['object_id'] == object_id]

        # One boolean mask per passband, then one array per channel.
        band_masks = [(object_lc['passband'] == pb) for pb in pbmap]
        times = [object_lc['mjd'][mask].data for mask in band_masks]
        fluxes = [object_lc['flux'][mask].data for mask in band_masks]
        flux_errs = [object_lc['flux_err'][mask].data for mask in band_masks]

        tsdict[object_id] = TimeSeries(t=times,
                                       m=fluxes,
                                       e=flux_errs,
                                       label=target,
                                       name=object_id,
                                       meta_features=meta,
                                       channel_names=pbnames)

    _emit("", "OK!", " ")

    # ------------------- Feature extraction with cesium -------------------
    warnings.simplefilter('ignore')
    if os.path.exists(featurefile):
        _emit("",
              "Loading features from file....wait",
              "==================================")
        featuretable, _ = featurize.load_featureset(featurefile)
        _emit("", "OK!", " ")
    else:
        features_list = []
        _emit("",
              "Computing features....wait",
              "==========================")

        with schwimmbad.MultiPool() as pool:
            for row_features in pool.imap(worker, list(tsdict.values())):
                features_list.append(row_features)

        featuretable = featurize.assemble_featureset(
            features_list=features_list, time_series=tsdict.values())
        featurize.impute_featureset(fset=featuretable,
                                    strategy='constant',
                                    value=0,
                                    max_value=18446744073709551000,
                                    inplace=True)
        featurize.save_featureset(fset=featuretable, path=featurefile)
        _emit("", "OK!", " ")

    # ------------------ Build the pandas DataFrame output -----------------
    multi_names = featuretable.columns.values
    flat_names = []
    for feature, channel in multi_names:
        flat_names.append('{}_{}'.format(feature, pbmap.get(channel, 'meta')))
    columns = [featuretable[col] for col in multi_names]

    allfeats = Table(columns, names=flat_names, masked=False)
    allfeats['target'] = metadata['target']

    # nan_to_num hands back a bare array without column names, so the table
    # is rebuilt with the names (now including 'target') before the final
    # conversion to pandas.
    cleaned = np.nan_to_num(allfeats.to_pandas())
    flat_names.append('target')
    allfeats = Table(cleaned, names=flat_names, masked=False).to_pandas()

    print("")
    print("Extracted features = ", len(allfeats.columns))
    print("==========================")
    print("")
    print("Nan Values detected = ", sum(len(allfeats) - allfeats.count()))
    print("==========================")

    return allfeats
Esempio n. 8
0
    #Occasionally print progress so we can assess speed
    if i % 1000000 == 0:
        print(str(i) + ' done out of ' + str(nobjects))

    
# Accumulates one entry per object: the features computed by `worker` for that
# object's TimeSeries.
features_list = []
print("Generating features from objects and storing to a table...")

# NOTE(review): the output path below is a Windows path. If this runs as a
# plain script there (rather than in a notebook), creating a Pool at module
# level normally needs an `if __name__ == '__main__':` guard -- confirm.
with multiprocessing.Pool() as pool:  
    # Apply the worker function to every TimeSeries object in `tsdict`
    # (the original note points to ./testWorker.py for the worker).
    # imap yields one result per input object, in input order.
    results = pool.imap(worker, list(tsdict.values()))
    i = 0
    for res in results:
        # Append this object's feature row to the list.
        features_list.append(res)
        # Progress report: printed for the first result and then once every
        # 1,000,000 results.
        if i % 1000000 == 0:
            print(str(i) + ' done out of ' + str(nobjects))
        i = i +1


# In[35]:

# With features generated for all objects, assemble them into a feature set
# and write it to a file so that this step does not need to be run again.
featurefile = 'C:/Users/Greg/Documents/Personal/PLAsTiCC/test.npz'
featuretable = featurize.assemble_featureset(features_list=features_list,time_series=tsdict.values())
featurize.save_featureset(fset=featuretable, path=featurefile)
print("Feature Table Created.")