from sklearn import utils as skutils  # Bunch container for the parsed samples

def load_test_data(filename):
    data = []
    with open(filename) as file:
        lines = file.read()
    # samples are separated by blank lines followed by a "test_" prefix
    samples = lines.strip().split("\n\ntest_")
    for sample in samples:
        sample = sample.strip()
        # drop the sample header (first 8 characters) and the trailing character
        d = sample[8:-1].strip()
        data.append(d)
    return skutils.Bunch(data=data)
import numpy as np
from skimage import transform
from sklearn import utils
# WIDTH and HEIGHT are module-level constants defined elsewhere in the source.

def load_image_files(dataset_target, dataset_numpy, dimension=(WIDTH, HEIGHT, 3)):
    flat_data = []
    for image in dataset_numpy:
        # resize the image and add its flattened pixels to the array
        img_resized = transform.resize(image, dimension, anti_aliasing=True, mode='reflect')
        flat_data.append(img_resized.flatten())
    flat_data = np.array(flat_data)
    target = np.array(dataset_target)
    # make a dict-like Bunch with the numpy array of images and targets
    return utils.Bunch(data=flat_data, target=target)
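# Usage sketch (assumption, not from the original source): feed load_image_files a small
# synthetic dataset and inspect the resulting Bunch. The random images and labels below are
# illustrative; WIDTH/HEIGHT are the module-level constants assumed above.
rng = np.random.default_rng(0)
demo_images = [rng.random((64, 64, 3)) for _ in range(4)]   # hypothetical raw RGB images
demo_labels = [0, 1, 0, 1]
demo_bunch = load_image_files(demo_labels, demo_images)
print(demo_bunch.data.shape, demo_bunch.target.shape)       # (4, WIDTH*HEIGHT*3), (4,)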
# sk_inspect/sk_utils/sk_metrics are sklearn.inspection/utils/metrics, type_of_target comes
# from sklearn.utils.multiclass; DaskToolBox and logger come from the surrounding project.
def permutation_importance(estimator, X, y, *, scoring=None, n_repeats=5, n_jobs=None, random_state=None):
    # Plain (non-dask) input: delegate to scikit-learn's implementation.
    if not DaskToolBox.is_dask_dataframe(X):
        return sk_inspect.permutation_importance(estimator, X, y, scoring=scoring, n_repeats=n_repeats,
                                                 n_jobs=n_jobs, random_state=random_state)

    random_state = sk_utils.check_random_state(random_state)

    def shuffle_partition(df, col_idx):
        # Shuffle a single column within one dask partition.
        shuffling_idx = np.arange(df.shape[0])
        random_state.shuffle(shuffling_idx)
        col = df.iloc[shuffling_idx, col_idx]
        col.index = df.index
        df.iloc[:, col_idx] = col
        return df

    if DaskToolBox.is_dask_object(y):
        y = y.compute()

    scorer = sk_metrics.check_scoring(
        DaskToolBox.wrap_for_local_scorer(estimator, type_of_target(y)), scoring)
    baseline_score = scorer(estimator, X, y)
    scores = []

    for c in range(X.shape[1]):
        col_scores = []
        for i in range(n_repeats):
            # Permute column c partition-wise and re-score the estimator.
            X_permuted = X.copy().map_partitions(shuffle_partition, c)
            col_scores.append(scorer(estimator, X_permuted, y))
        if logger.is_debug_enabled():
            logger.debug(f'permuted scores [{X.columns[c]}]: {col_scores}')
        scores.append(col_scores)

    # Importance is the drop in score caused by permuting each column.
    importances = baseline_score - np.array(scores)
    return sk_utils.Bunch(importances_mean=np.mean(importances, axis=1),
                          importances_std=np.std(importances, axis=1),
                          importances=importances)
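# Usage sketch (assumption): with a plain pandas DataFrame the function above simply delegates
# to sklearn.inspection.permutation_importance, so a fitted estimator plus X/y is enough. The
# toy data and model below are illustrative; DaskToolBox must still be importable.
import pandas as pd
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X_demo, y_demo = make_classification(n_samples=200, n_features=5, random_state=0)
X_demo = pd.DataFrame(X_demo, columns=[f'f{i}' for i in range(5)])
lr = LogisticRegression(max_iter=1000).fit(X_demo, y_demo)
result = permutation_importance(lr, X_demo, y_demo, scoring='accuracy', n_repeats=3, random_state=0)
print(result.importances_mean)  # one mean importance per column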
def load_train_data(filename):
    data = []
    target = []
    target_names = ['pos', 'neg']
    with open(filename) as file:
        lines = file.read()
    # samples are separated by blank lines followed by a "train_" prefix
    samples = lines.strip().split("\n\ntrain_")
    for sample in samples:
        sample = sample.strip()
        # text body: drop the sample header and the trailing label marker
        d = sample[8:-3].strip()
        # label: the last character of the sample
        t = sample[-1:]
        data.append(d)
        target.append(t)
    return skutils.Bunch(data=data, target=target, target_names=target_names)
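# Usage sketch (assumption): the file names below are hypothetical; each file is expected to
# hold samples separated by blank lines and prefixed with "train_" / "test_" as the two
# loaders above assume.
train_bunch = load_train_data('train_samples.txt')
test_bunch = load_test_data('test_samples.txt')
print(len(train_bunch.data), train_bunch.target_names)  # number of samples and ['pos', 'neg']
print(len(test_bunch.data))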
def load_image_files(dataset_target, dataset_numpy, dimension=(WIDTH, HEIGHT, 3)):
    flat_data1 = []
    for image in dataset_numpy:
        # resize each image and keep its flattened pixel vector
        img_resized = transform.resize(image, dimension, anti_aliasing=True, mode='reflect')
        flat_data1.append(img_resized.flatten())
    flat_data1 = np.array(flat_data1)
    target1 = np.array(dataset_target)
    return utils.Bunch(data=flat_data1, target=target1)
def permutation_importance_batch(cls, estimators, X, y, scoring=None, n_repeats=5, n_jobs=None, random_state=None):
    """Evaluate the feature importance of a set of estimators.

    Parameters
    ----------
    estimators : list
        A set of estimators that have already been :term:`fitted` and are
        compatible with :term:`scorer`.
    X : ndarray or DataFrame, shape (n_samples, n_features)
        Data on which permutation importance will be computed.
    y : array-like or None, shape (n_samples, ) or (n_samples, n_classes)
        Targets for supervised learning, or `None` for unsupervised.
    scoring : string, callable or None, default=None
        Scorer to use. It can be a single string (see :ref:`scoring_parameter`)
        or a callable (see :ref:`scoring`). If None, the estimator's default
        scorer is used.
    n_repeats : int, default=5
        Number of times to permute a feature.
    n_jobs : int or None, default=None
        The number of jobs to use for the computation. `None` means 1 unless in
        a :obj:`joblib.parallel_backend` context. `-1` means using all
        processors. See :term:`Glossary <n_jobs>` for more details.
    random_state : int, RandomState instance, or None, default=None
        Pseudo-random number generator to control the permutations of each
        feature. See :term:`random_state`.

    Returns
    -------
    result : Bunch
        Dictionary-like object, with attributes:

        importances_mean : ndarray, shape (n_features, )
            Mean of feature importance over `n_repeats`.
        importances_std : ndarray, shape (n_features, )
            Standard deviation over `n_repeats`.
        importances : ndarray, shape (n_features, n_repeats)
            Raw permutation importance scores.
    """
    importances = []

    # Sample down very large datasets before scoring.
    X_shape = cls.get_shape(X)
    if X_shape[0] > c.permutation_importance_sample_limit:
        if logger.is_info_enabled():
            logger.info(f'{X_shape[0]} rows data found, sample to {c.permutation_importance_sample_limit}')
        frac = c.permutation_importance_sample_limit / X_shape[0]
        X, _, y, _ = cls.train_test_split(X, y, train_size=frac, random_state=random_state)

    if isinstance(n_jobs, int) and n_jobs <= 0:
        n_jobs = None  # higher performance than -1

    for i, est in enumerate(estimators):
        if logger.is_info_enabled():
            logger.info(f'score permutation importance by estimator {i}/{len(estimators)}')
        importance = cls.permutation_importance(est, X.copy(), y.copy(), scoring=scoring,
                                                n_repeats=n_repeats, n_jobs=n_jobs, random_state=random_state)
        importances.append(importance.importances)

    # Stack the per-estimator (n_features, n_repeats) arrays into a single
    # (n_features, n_repeats * n_estimators) matrix before averaging.
    importances = np.reshape(np.stack(importances, axis=2), (X.shape[1], -1), 'F')
    bunch = sk_utils.Bunch(importances_mean=np.mean(importances, axis=1),
                           importances_std=np.std(importances, axis=1),
                           importances=importances,
                           columns=X.columns.to_list())
    return bunch
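# Sketch (assumption, illustrative numbers only): how the per-estimator importances are
# combined. Two estimators, 4 features, 5 repeats collapse into one (4, 10) matrix whose
# row means become importances_mean.
import numpy as np
imp_a = np.random.rand(4, 5)   # importances from estimator 0
imp_b = np.random.rand(4, 5)   # importances from estimator 1
stacked = np.reshape(np.stack([imp_a, imp_b], axis=2), (4, -1), 'F')
print(stacked.shape)                 # (4, 10)
print(stacked.mean(axis=1).shape)    # (4,) -> importances_mean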
# looking backwards.
# algo: y = (2 * x) - 1
# model.predict(3) ≈ 5
# going backwards from y = 5 should recover x = 3 (see the inverse sketch below)
import numpy as np
from sklearn import utils, linear_model, metrics

func_y = lambda x: (2 * x) - 1
X = range(1200)
y = [func_y(i) for i in X]

dataset = utils.Bunch()
dataset.data = np.array(X).reshape(-1, 1)
dataset.target = np.array(y).reshape(-1, 1)

model = linear_model.LinearRegression()
model.fit(dataset.data, dataset.target)

test_X = np.array([i for i in range(3, 1200)]).reshape(-1, 1)
test_y = np.array([func_y(i) for i in test_X]).reshape(-1, 1)
pred_y = model.predict(test_X)

scorer = metrics.explained_variance_score
scorer(test_y, pred_y)
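# Sketch (assumption): the "looking backwards" step, a second regression fitted with the
# roles of X and y swapped, so a target value of 5 maps back to the input 3.
inverse_model = linear_model.LinearRegression()
inverse_model.fit(dataset.target, dataset.data)   # regress x on y
inverse_model.predict(np.array([[5]]))            # ≈ [[3.]]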
import numpy as np
import pandas as pd
# Assumptions: `models` is diffprivlib.models (IBM differential privacy library), `util` is
# sklearn.utils, and `toNum`, `filepath`, `filename`, `names` are defined elsewhere.

df = pd.read_csv(filepath + filename, header=0, names=names)

# Encode gender as a numeric feature: male -> 0, female -> 1, anything else -> 0.
for row in df.index:
    if df.at[row, 'gender'] == 'male':
        df.at[row, 'gender'] = 0
    elif df.at[row, 'gender'] == 'female':
        df.at[row, 'gender'] = 1
    else:
        df.at[row, 'gender'] = 0

fb_dataset = util.Bunch()
fb_dataset.data = toNum(np.array(df))
fb_dataset.target = np.array([int(df.at[row, 'age']) for row in df.index])

print("filename", filepath + filename)
print("e-differential privacy")

# Note: this snippet reuses the full dataset as both the train and the test split.
X_train, X_test, y_train, y_test = fb_dataset.data, fb_dataset.data, fb_dataset.target, fb_dataset.target

epsilons = np.logspace(-2, 2, 50)

# Per-feature bounds for the differentially private model, so it does not have to infer
# value ranges from the data itself.
minbounds = np.amin(X_train, axis=0)
maxbounds = np.amax(X_train, axis=0)
bounds = [(minbounds[i], maxbounds[i]) for i in range(X_train[0].size)]

accuracy = list()
epsilon = 1
clf = models.GaussianNB(bounds=bounds, epsilon=epsilon)
clf.fit(X_train, y_train)
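# Sketch (assumption): the unused `epsilons` array and `accuracy` list above suggest an
# accuracy-vs-epsilon sweep; diffprivlib models follow the sklearn API, so `score` returns
# plain accuracy. This extrapolates the snippet's apparent intent.
for eps in epsilons:
    clf_eps = models.GaussianNB(bounds=bounds, epsilon=eps)
    clf_eps.fit(X_train, y_train)
    accuracy.append(clf_eps.score(X_test, y_test))
print(list(zip(epsilons, accuracy))[:3])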
import numpy as np
import matplotlib.pyplot as plt
from sklearn import utils
# `image_generator(image, angle)` is a project-specific rotation helper defined elsewhere.

def load_data(imgfilepath, targetfilepath):
    saveddata = np.loadtxt(imgfilepath, delimiter=',')
    target = np.loadtxt(targetfilepath, delimiter=',')
    images = []
    flat_data = []
    # need to still add target and descr
    descr = []
    fig = plt.figure()
    count = 0
    cols = 20
    n_images = len(saveddata) * 2

    for i in saveddata:
        # reshape the flat 10000-vector back into a 100x100 image
        i = i.reshape([100, 100])
        # rotate the image by 180 degrees
        i = image_generator(i, 180)
        images.append(i)
        # flatten again and stack onto the flat data matrix
        i = i.reshape([10000, ])
        if len(flat_data) == 0:
            flat_data = i
        else:
            flat_data = np.vstack([flat_data, i])

    rottargets = []
    rotdata = []
    targetnew = []
    # run this multiple times to create random-angle images
    for x in range(5):
        for t, i in zip(target, saveddata):
            # reshape the flat vector back into a 100x100 image
            i = i.reshape([100, 100])
            # rotate the image by a random angle
            rotation = np.random.randint(1, 359)
            i = image_generator(i, rotation)
            images.append(i)
            # flatten and stack the rotated image
            i = i.reshape([10000, ])
            flat_data = np.vstack([flat_data, i])
            # record the target for the rotated image
            targetnew.append(t)

    target = np.concatenate([target, np.array(targetnew, float)])

    # plot it
    # fig.set_size_inches(np.array(fig.get_size_inches()) * (n_images - 1))
    fig.tight_layout()
    # uncomment this to show the images
    # plt.show()

    return utils.Bunch(data=flat_data, target=target, target_names=np.arange(10),
                       images=images, DESCR=descr)
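# Usage sketch (assumption): the CSV paths are hypothetical; each row of the image file is a
# flattened 100x100 image and the target file holds one label per row.
digits = load_data('images.csv', 'targets.csv')
print(digits.data.shape, digits.target.shape, len(digits.images))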
# (tail of the elided download loop: a symbol that failed to download is recorded and skipped;
# `data_raw` is the per-symbol price data built in that loop, `sku` is sklearn.utils)
unprocessed_symbols.append(symbol)
continue

logger.info("Data downloaded")

data = {}
for symbols_index, symbol in enumerate(symbols):
    if symbol in unprocessed_symbols:
        continue
    # Normalize each price series relative to its first row.
    data_raw_norm = data_raw[symbol].divide(data_raw[symbol].iloc[0]) - 1
    for column in data_raw_norm.columns:
        # Fall back to the raw column if normalization produced NaN or infinite values.
        if not (pd.notnull(data_raw_norm[[column]]) & np.isfinite(data_raw_norm[[column]])).all()[0]:
            data_raw_norm[[column]] = data_raw[symbol][[column]]
    # Features are today's normalized rows; the target is the next day's normalized AdjClose.
    data[symbol] = sku.Bunch(data=data_raw_norm[:-1],
                             target=np.ravel(data_raw_norm[['AdjClose']])[1:])
logger.info("Symbols normalized")


import base64
import pickle
import zlib


def decode_data(data):
    # Reverse of encode_data: base64 -> zlib -> pickle.
    st = zlib.decompress(base64.b64decode(data))
    return pickle.loads(st)


def encode_data(data):
    # Pickle the object, compress it, and base64-encode it for safe text storage.
    p = pickle.dumps(data)
    z = zlib.compress(p)
    return base64.b64encode(z)
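# Sketch (assumption): round-trip one Bunch through the zlib/base64 pickle helpers above;
# any picklable object works the same way.
from sklearn.utils import Bunch
original = Bunch(data=[1, 2, 3], target=[0, 1, 0])
encoded = encode_data(original)          # base64-encoded, compressed pickle bytes
restored = decode_data(encoded)
print(restored.data, restored.target)    # [1, 2, 3] [0, 1, 0]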