Example #1
from sklearn import utils as skutils  # assumed import: Bunch lives in sklearn.utils

def load_test_data(filename):
    data = []

    with open(filename) as file:
        lines = file.read()

    # Records are separated by a blank line followed by a "test_" prefix.
    samples = lines.strip().split("\n\ntest_")

    for sample in samples:
        sample = sample.strip()
        # Drop the record header (first 8 chars) and the trailing character.
        d = sample[8:-1].strip()
        data.append(d)

    return skutils.Bunch(data=data)
Example #2
import numpy as np              # assumed imports: the snippet relies on numpy,
from skimage import transform   # scikit-image, and sklearn.utils
from sklearn import utils

# WIDTH and HEIGHT are module-level constants defined elsewhere.
def load_image_files(dataset_target, dataset_numpy, dimension=(WIDTH, HEIGHT, 3)):
    flat_data = []

    for image in dataset_numpy:
        # Resize the image and append its flattened pixel values.
        img_resized = transform.resize(image, dimension, anti_aliasing=True, mode='reflect')
        flat_data.append(img_resized.flatten())

    flat_data = np.array(flat_data)
    target = np.array(dataset_target)

    # Return a dict-like Bunch holding the image array and targets.
    return utils.Bunch(data=flat_data,
                       target=target)
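
A minimal usage sketch with synthetic data (the shapes and the explicit dimension value are illustrative):

# Usage sketch with synthetic inputs; shapes and values are illustrative.
images = [np.random.rand(100, 80, 3) for _ in range(3)]
bunch = load_image_files([0, 1, 0], images, dimension=(32, 32, 3))
print(bunch.data.shape)  # (3, 3072): three flattened 32x32x3 images
print(bunch.target)      # [0 1 0]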
Example #3
    def permutation_importance(estimator,
                               X,
                               y,
                               *,
                               scoring=None,
                               n_repeats=5,
                               n_jobs=None,
                               random_state=None):
        if not DaskToolBox.is_dask_dataframe(X):
            return sk_inspect.permutation_importance(estimator,
                                                     X,
                                                     y,
                                                     scoring=scoring,
                                                     n_repeats=n_repeats,
                                                     n_jobs=n_jobs,
                                                     random_state=random_state)
        random_state = sk_utils.check_random_state(random_state)

        # Shuffle one column within a partition, keeping the index aligned.
        def shuffle_partition(df, col_idx):
            shuffling_idx = np.arange(df.shape[0])
            random_state.shuffle(shuffling_idx)
            col = df.iloc[shuffling_idx, col_idx]
            col.index = df.index
            df.iloc[:, col_idx] = col
            return df

        if DaskToolBox.is_dask_object(y):
            y = y.compute()

        scorer = sk_metrics.check_scoring(
            DaskToolBox.wrap_for_local_scorer(estimator, type_of_target(y)),
            scoring)
        baseline_score = scorer(estimator, X, y)
        scores = []

        # Permute each column n_repeats times and score the permuted copies.
        for c in range(X.shape[1]):
            col_scores = []
            for i in range(n_repeats):
                X_permuted = X.copy().map_partitions(shuffle_partition, c)
                col_scores.append(scorer(estimator, X_permuted, y))
            if logger.is_debug_enabled():
                logger.debug(f'permuted scores [{X.columns[c]}]: {col_scores}')
            scores.append(col_scores)

        importances = baseline_score - np.array(scores)
        return sk_utils.Bunch(importances_mean=np.mean(importances, axis=1),
                              importances_std=np.std(importances, axis=1),
                              importances=importances)
Example #4
from sklearn import utils as skutils  # assumed import: Bunch lives in sklearn.utils

def load_train_data(filename):
    data = []
    target = []
    target_names = ['pos', 'neg']

    with open(filename) as file:
        lines = file.read()

    # Records are separated by a blank line followed by a "train_" prefix.
    samples = lines.strip().split("\n\ntrain_")

    for sample in samples:
        sample = sample.strip()
        # The body sits between the record header and a short trailing field;
        # the final character is the target label.
        d = sample[8:-3].strip()
        t = sample[-1:]
        data.append(d)
        target.append(t)

    return skutils.Bunch(data=data, target=target, target_names=target_names)
Example #5
import numpy as np              # assumed imports, as in Example #2
from skimage import transform
from sklearn import utils

def load_image_files(dataset_target,
                     dataset_numpy,
                     dimension=(WIDTH, HEIGHT, 3)):
    flat_data1 = []

    for image in dataset_numpy:
        # Resize each image, then store its flattened pixel values.
        img_resized = transform.resize(image,
                                       dimension,
                                       anti_aliasing=True,
                                       mode='reflect')
        flat_data1.append(img_resized.flatten())

    flat_data1 = np.array(flat_data1)
    target1 = np.array(dataset_target)

    return utils.Bunch(data=flat_data1, target=target1)
Example #6
    def permutation_importance_batch(cls, estimators, X, y, scoring=None, n_repeats=5,
                                     n_jobs=None, random_state=None):
        """Evaluate the importance of features of a set of estimators

        Parameters
        ----------
        estimators : list
            A list of estimators that have already been :term:`fitted` and are
            compatible with the :term:`scorer`.

        X : ndarray or DataFrame, shape (n_samples, n_features)
            Data on which permutation importance will be computed.

        y : array-like or None, shape (n_samples, ) or (n_samples, n_classes)
            Targets for supervised or `None` for unsupervised.

        scoring : string, callable or None, default=None
            Scorer to use. It can be a single
            string (see :ref:`scoring_parameter`) or a callable (see
            :ref:`scoring`). If None, the estimator's default scorer is used.

        n_repeats : int, default=5
            Number of times to permute a feature.

        n_jobs : int or None, default=None
            The number of jobs to use for the computation.
            `None` means 1 unless in a :obj:`joblib.parallel_backend` context.
            `-1` means using all processors. See :term:`Glossary <n_jobs>`
            for more details.

        random_state : int, RandomState instance, or None, default=None
            Pseudo-random number generator to control the permutations of each
            feature. See :term:`random_state`.

        Returns
        -------
        result : Bunch
            Dictionary-like object, with attributes:

            importances_mean : ndarray, shape (n_features, )
                Mean of feature importance over `n_repeats`.
            importances_std : ndarray, shape (n_features, )
                Standard deviation over `n_repeats`.
            importances : ndarray, shape (n_features, n_repeats)
                Raw permutation importance scores.
        """
        importances = []

        X_shape = cls.get_shape(X)
        if X_shape[0] > c.permutation_importance_sample_limit:
            if logger.is_info_enabled():
                logger.info(f'{X_shape[0]} rows data found, sample to {c.permutation_importance_sample_limit}')
            frac = c.permutation_importance_sample_limit / X_shape[0]
            X, _, y, _ = cls.train_test_split(X, y, train_size=frac, random_state=random_state)

        if isinstance(n_jobs, int) and n_jobs <= 0:
            n_jobs = None  # None performs better here than -1 (all processors)

        for i, est in enumerate(estimators):
            if logger.is_info_enabled():
                logger.info(f'scoring permutation importance for estimator {i + 1}/{len(estimators)}')
            importance = cls.permutation_importance(est, X.copy(), y.copy(),
                                                    scoring=scoring, n_repeats=n_repeats, n_jobs=n_jobs,
                                                    random_state=random_state)
            importances.append(importance.importances)

        importances = np.reshape(np.stack(importances, axis=2), (X.shape[1], -1), 'F')
        bunch = sk_utils.Bunch(importances_mean=np.mean(importances, axis=1),
                               importances_std=np.std(importances, axis=1),
                               importances=importances,
                               columns=X.columns.to_list())
        return bunch
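
For comparison, scikit-learn's built-in sklearn.inspection.permutation_importance returns a Bunch with the same fields; a minimal self-contained sketch:

# Reference sketch: sklearn's built-in routine yields the same
# importances_mean / importances_std / importances fields.
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.inspection import permutation_importance

X, y = make_classification(n_samples=200, n_features=5, random_state=0)
model = RandomForestClassifier(random_state=0).fit(X, y)
result = permutation_importance(model, X, y, n_repeats=5, random_state=0)
print(result.importances.shape)  # (5, 5): (n_features, n_repeats)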
Example #7
# Looking backwards: learn y = f(x), then recover x from y.

# algo: y = (2 * x) - 1
# model.predict(3) should give ~5
# "determining" 5 backwards should give 3, since x = (y + 1) / 2

import numpy as np
from sklearn import linear_model, metrics, utils

func_y = lambda x: (2 * x) - 1

X = range(1200)
y = [func_y(i) for i in X]

dataset = utils.Bunch()
dataset.data = np.array(X).reshape(-1, 1)
dataset.target = np.array(y).reshape(-1, 1)

model = linear_model.LinearRegression()
model.fit(dataset.data, dataset.target)

test_X = np.array([i for i in range(3, 1200)]).reshape(-1, 1)
test_y = np.array([func_y(i) for i in test_X]).reshape(-1, 1)
pred_y = model.predict(test_X)

scorer = metrics.explained_variance_score
scorer(test_y, pred_y)
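
sklearn estimators have no determine() method; one hedged way to realize the "looking backwards" idea is a second regression fit with X and y swapped (inverse_model is a hypothetical name):

# Hedged sketch: fit a second LinearRegression with the roles of X and y
# swapped so it maps y back to x.
inverse_model = linear_model.LinearRegression()
inverse_model.fit(dataset.target, dataset.data)
print(inverse_model.predict(np.array([[5]])))  # ~[[3.]], since 5 = (2 * 3) - 1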
Example #8
import numpy as np                  # assumed imports: `models` is taken to be
import pandas as pd                 # diffprivlib.models and `util` to be
from sklearn import utils as util   # sklearn's utils module
from diffprivlib import models

# filepath, filename, names and toNum are defined earlier in the script.
df = pd.read_csv(filepath + filename, header=0, names=names)

# Encode gender as an integer: male -> 0, female -> 1, anything else -> 0.
for row in df.index:
    if df.at[row, 'gender'] == 'male':
        df.at[row, 'gender'] = 0
    elif df.at[row, 'gender'] == 'female':
        df.at[row, 'gender'] = 1
    else:
        df.at[row, 'gender'] = 0

fb_dataset = util.Bunch()
fb_dataset.data = toNum(np.array(df))
fb_dataset.target = np.array([int(df.at[row, 'age']) for row in df.index])

print("filename", filepath + filename)
print("ε-differential privacy")
# Note: train and test are the same arrays here; no hold-out split is made.
X_train, X_test, y_train, y_test = fb_dataset.data, fb_dataset.data, fb_dataset.target, fb_dataset.target
epsilons = np.logspace(-2, 2, 50)
minbounds = np.amin(X_train, axis=0)
maxbounds = np.amax(X_train, axis=0)
bounds = [(minbounds[i], maxbounds[i]) for i in range(X_train[0].size)]

accuracy = list()
epsilon = 1
clf = models.GaussianNB(bounds=bounds, epsilon=epsilon)
clf.fit(X_train, y_train)
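
The epsilons grid and accuracy list above are otherwise unused; a hedged sketch of the sweep they suggest, assuming the classifier exposes the standard sklearn score method:

# Hedged sketch of the implied epsilon sweep: refit the private classifier
# at each privacy budget and record its accuracy.
for eps in epsilons:
    clf = models.GaussianNB(bounds=bounds, epsilon=eps)
    clf.fit(X_train, y_train)
    accuracy.append(clf.score(X_test, y_test))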
Example #9
import numpy as np                # assumed imports; image_generator is taken
import matplotlib.pyplot as plt   # to be a rotation helper defined elsewhere
from sklearn import utils

def load_data(imgfilepath, targetfilepath):
    saveddata = np.loadtxt(imgfilepath, delimiter=',')
    target = np.loadtxt(targetfilepath, delimiter=',')
    images = []
    flat_data = []
    descr = []  # DESCR is left empty for now

    fig = plt.figure()

    count = 0
    cols = 20
    n_images = len(saveddata) * 2

    for i in saveddata:
        # Reshape the flat 10000-vector into a 100x100 image.
        i = i.reshape([100, 100])

        # Rotate the image by 180 degrees.
        i = image_generator(i, 180)
        images.append(i)

        # Flatten back to 10000 values and stack onto flat_data.
        i = i.reshape([10000])
        if len(flat_data) == 0:
            flat_data = i
        else:
            flat_data = np.vstack([flat_data, i])

    targetnew = []

    # Run several passes to create copies rotated at random angles.
    for x in range(5):
        for t, i in zip(target, saveddata):
            i = i.reshape([100, 100])

            # Rotate the image by a random angle.
            rotation = np.random.randint(1, 359)
            i = image_generator(i, rotation)
            images.append(i)

            i = i.reshape([10000])
            flat_data = np.vstack([flat_data, i])

            # Record the target for the rotated copy.
            targetnew.append(t)

    target = np.concatenate([target, np.array(targetnew, float)])

    fig.tight_layout()
    # plt.show()  # uncomment to display the figure

    return utils.Bunch(data=flat_data,
                       target=target,
                       target_names=np.arange(10),
                       images=images,
                       DESCR=descr)
Example #10
        # Record symbols whose download failed so they can be skipped below.
        unprocessed_symbols.append(symbol)
        continue

logger.info("Data downloaded")

data = {}
for symbols_index, symbol in enumerate(symbols):
    if symbol in unprocessed_symbols:
        continue

    data_raw_norm = data_raw[symbol].divide(data_raw[symbol].iloc[0]) - 1
    for column in data_raw_norm.columns:
        # Fall back to the raw series if normalization produced NaN/inf values.
        if not (pd.notnull(data_raw_norm[[column]])
                & np.isfinite(data_raw_norm[[column]])).all()[0]:
            data_raw_norm[[column]] = data_raw[symbol][[column]]
    data[symbol] = sku.Bunch(data=data_raw_norm[:-1],
                             target=np.ravel(data_raw_norm[['AdjClose']])[1:])

logger.info("Symbols normalized")


import base64   # assumed imports for the codec helpers below
import pickle
import zlib


def decode_data(data):
    # Reverse the encoding pipeline: base64 -> zlib -> pickle.
    st = zlib.decompress(base64.b64decode(data))
    return pickle.loads(st)


def encode_data(data):
    # pickle, zlib-compress, then base64-encode so the blob is text-safe.
    p = pickle.dumps(data)
    z = zlib.compress(p)
    return base64.b64encode(z)
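
A quick round-trip check with a hypothetical payload:

# decode_data inverts encode_data for any picklable object.
payload = {"symbol": "AAPL", "rows": [1, 2, 3]}
blob = encode_data(payload)   # base64-encoded bytes
assert decode_data(blob) == payload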