def summary(n_samples, min_n_clusters, max_n_clusters, n_features, random_state=0, n_loops=10):
    gm_01_scores = []
    gm_02_scores = []
    xm_scores = []
    for n_cluster in np.linspace(min_n_clusters, max_n_clusters, max_n_clusters - min_n_clusters + 1):
        total_gm_01_score = 0
        total_gm_02_score = 0
        total_xm_score = 0
        for i in range(n_loops):
            n_cluster = n_cluster.astype(int)
            X, y = datasets.make_blobs(n_samples=n_samples, n_features=n_features,
                                       centers=n_cluster, random_state=random_state)
            X = StandardScaler().fit_transform(X)
            X_1 = X.copy()
            X_2 = X.copy()
            X_3 = X.copy()
            gm_01 = GMeans_01().fit(X_1)
            gm_02 = GMeans_02().fit(X_2)
            xm = XMeans().fit(X_3)
            gm_01_score = silhouette_score(X_1, gm_01.labels, metric='euclidean')
            gm_02_score = silhouette_score(X_2, gm_02.labels, metric='euclidean')
            xm_score = silhouette_score(X_3, xm.labels_, metric='euclidean')
            # print("xm = {}".format(xm_score))
            total_gm_01_score += gm_01_score
            total_gm_02_score += gm_02_score
            total_xm_score += xm_score
        total_gm_01_score = total_gm_01_score / (n_loops * 1.0)
        total_gm_02_score = total_gm_02_score / (n_loops * 1.0)
        total_xm_score = total_xm_score / (n_loops * 1.0)
        print("n_samples = {}, n_features = {}, n_cluster = {}, gm_01_score = {}, gm_02_score = {}, xm_score = {}"
              .format(n_samples, n_features, n_cluster, total_gm_01_score, total_gm_02_score, total_xm_score))
        gm_01_scores.append(total_gm_01_score)
        gm_02_scores.append(total_gm_02_score)
        xm_scores.append(total_xm_score)
    return gm_01_scores, gm_02_scores, xm_scores
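# A minimal, self-contained sketch (not part of the original benchmark) of the same
# silhouette-based comparison using plain KMeans; GMeans_01, GMeans_02 and XMeans are
# project-specific estimators that are not shown here.
import numpy as np
from sklearn import datasets
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler

# Generate one blob dataset and score a single clustering, mirroring the inner loop above.
X_demo, _ = datasets.make_blobs(n_samples=500, n_features=2, centers=4, random_state=0)
X_demo = StandardScaler().fit_transform(X_demo)
labels = KMeans(n_clusters=4, random_state=0).fit(X_demo).labels_
print(silhouette_score(X_demo, labels, metric='euclidean'))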
def predict(self, X=None):
    """Compute the stochastic outlier selection (SOS) probability for each sample in X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_features)
        New data to predict.

    Returns
    -------
    probs : array, shape (n_samples,)
        Outlier probabilities determined by stochastic outlier selection.
    """
    if X is None:
        if self.X_ is None:
            raise Exception("No data")
        X = self.X_
    log_format = '%(asctime)-15s [%(levelname)s] - %(name)s: %(message)s'
    logging.basicConfig(format=log_format, level=logging.INFO)
    logger = logging.getLogger('SOS')
    if self.verbose:
        logger.setLevel(logging.DEBUG)
    else:
        logger.setLevel(logging.ERROR)
    if self.standard_scale:
        X = StandardScaler().fit_transform(X.copy())
    return sos(X, self.metric, self.perplexity, logger=logger)
def test_transformers_data_not_an_array():
    # test if transformers do something sensible on training set
    # also test all shapes / shape errors
    transformers = all_estimators(type_filter='transformer')
    X, y = make_blobs(n_samples=30, centers=[[0, 0, 0], [1, 1, 1]],
                      random_state=0, n_features=2, cluster_std=0.1)
    X = StandardScaler().fit_transform(X)
    # We need to make sure that we have non-negative data, for things
    # like NMF
    X -= X.min() - .1

    for name, Transformer in transformers:
        # XXX: some transformers are transforming the input
        # data. This is a bug that we'll fix later. Right now we copy
        # the data each time
        this_X = NotAnArray(X.copy())
        this_y = NotAnArray(np.asarray(y))
        if name in dont_test:
            continue
        # these don't actually fit the data:
        if name in ['AdditiveChi2Sampler', 'Binarizer', 'Normalizer']:
            continue
        # and these don't support multivariate output
        if name in ('PLSCanonical', 'PLSRegression', 'CCA', 'PLSSVD'):
            continue
        yield check_transformer, name, Transformer, this_X, this_y
def get_feature_patches(FV, patch_size, patch_shift, input_shape):
    FV = StandardScaler(copy=False).fit_transform(FV)
    # FV should be of the shape (nFeatures, nFrames)
    if any(np.array([9, 10, 21, 22, 39]) == np.shape(FV)[1]):
        FV = FV.T
    patches = np.empty([])
    if np.shape(FV)[1] < patch_size:
        FV1 = FV.copy()
        while np.shape(FV)[1] <= patch_size:
            FV = np.append(FV, FV1, axis=1)
    numPatches = int(np.ceil(np.shape(FV)[1] / patch_shift))
    patches = PatchExtractor(patch_size=(np.shape(FV)[0], patch_size),
                             max_patches=numPatches).transform(np.expand_dims(FV, axis=0))
    # print('sklearn splitting: ', time.clock()-startTime, np.shape(patches))
    # print('Patches: ', np.shape(patches))
    if (np.shape(patches)[1] == 9) or (np.shape(patches)[1] == 10):
        diff_dim = input_shape[0] - np.shape(patches)[1]
        zero_padding = np.zeros((np.shape(patches)[0], diff_dim, np.shape(patches)[2]))
        patches = np.append(patches, zero_padding, axis=1)
    elif np.shape(patches)[1] == 22:
        patches = patches[:, :21, :]
    elif np.shape(patches)[1] == 39:
        first_7_cep_dim = np.array(list(range(0, 7)) + list(range(13, 20)) + list(range(26, 33)))
        patches = patches[:, first_7_cep_dim, :]
    # print('Patches: ', np.shape(patches))
    return patches
def get_feature_patches(FV, patch_size, patch_shift, input_shape):
    FV = StandardScaler(copy=False).fit_transform(FV)
    # FV should be of the shape (nFeatures, nFrames)
    if any(np.array([9, 10, 21, 22, 39]) == np.shape(FV)[1]):
        FV = FV.T
    patches = np.empty([])
    if np.shape(FV)[1] < patch_size:
        # print('Size append: ', np.shape(FV), patch_size)
        FV1 = FV.copy()
        while np.shape(FV)[1] <= patch_size:
            FV = np.append(FV, FV1, axis=1)
    numPatches = int(np.ceil(np.shape(FV)[1] / patch_shift))
    patches = PatchExtractor(patch_size=(np.shape(FV)[0], patch_size),
                             max_patches=numPatches).transform(np.expand_dims(FV, axis=0))
    patches_mean = np.mean(patches, axis=2)
    patches_var = np.var(patches, axis=2)
    patches_mean_var = np.append(patches_mean, patches_var, axis=1)
    # print('sklearn splitting: ', time.clock()-startTime, np.shape(patches))
    # print('Patches: ', np.shape(patches))
    if np.shape(patches_mean_var)[1] != 2 * input_shape[0]:
        # This condition checks for 39CC
        if np.shape(patches_mean_var)[1] == 44:
            patches_mean_var = patches_mean_var[:, list(range(0, 21)) + list(range(22, 43))]
        elif np.shape(patches_mean_var)[1] == 78:
            first_7_cep_dim = np.array(
                list(range(0, 7)) + list(range(13, 20)) + list(range(26, 33)) +
                list(range(39, 46)) + list(range(52, 59)) + list(range(65, 72)))
            patches_mean_var = patches_mean_var[:, first_7_cep_dim]
    # print('patches_mean_var: ', np.shape(patches_mean_var))
    return patches_mean_var
coef = np.zeros(n_features)
coef[:n_relevant_features] = coef_min + rng.rand(n_relevant_features)

# The correlation of our design: variables correlated by blocks of 3
corr = np.zeros((n_features, n_features))
for i in range(0, n_features, block_size):
    corr[i:i + block_size, i:i + block_size] = 1 - conditioning
corr.flat[::n_features + 1] = 1
corr = linalg.cholesky(corr)

# Our design
x = rng.normal(size=(n_samples, n_features))
x = np.dot(x, corr)
x[:n_relevant_features] /= np.abs(linalg.svdvals(x[:n_relevant_features])).max()
x = StandardScaler().fit_transform(x.copy())

# The output variable
y = np.dot(x, coef)
y /= np.std(y)
y += noise_level * rng.normal(size=n_samples)
mi = mutual_incoherence(x[:, :n_relevant_features], x[:, n_relevant_features:])

# Plot stability selection path, using a high eps for early stopping
# of the path, to save computing time
alpha_grid, scores_path = lasso_stability_path(x, y, random_state=42, eps=0.05)

plt.figure()
def dj_rec(track_id):
    '''Get similar songs based on their audio features.'''
    neighbors = 4
    max_distance = 5.0
    # [:-10] trims the related-artist list so that roughly 10 of the closest songs to the
    # original track_id are returned; removing [:-10] returns about 20 songs, but the
    # prediction then takes roughly twice as long.
    rel_artists = sp.artist_related_artists(
        sp.track(track_id=track_id)['artists'][0]['id'])['artists'][:-10]
    artist_log = []
    for a in rel_artists:
        artist_log.append(a['id'])
    feat_log = []
    for artist in artist_log:
        for track in sp.artist_top_tracks(artist)['tracks']:
            feat_log.append(sp.audio_features(track['id'])[0])
    catalog = pd.DataFrame.from_dict(feat_log)
    root = pd.DataFrame.from_dict(sp.audio_features(tracks=[track_id]))
    merged_df = root.append(catalog, ignore_index=True)
    dropped_df = merged_df.drop(columns=[
        'uri', 'track_href', 'id', 'duration_ms', 'time_signature', 'mode',
        'loudness', 'type', 'analysis_url'
    ])
    scaled_df = StandardScaler().fit_transform(dropped_df)
    trans_array = scaled_df.copy()
    trans_array[:, 0] = [u * 2.4 for u in trans_array[:, 0]]               # acousticness
    trans_array[:, 1] = [((u * u) ** 0.5) * u for u in trans_array[:, 1]]  # danceability
    trans_array[:, 2] = [u * 1.7 for u in trans_array[:, 2]]               # energy
    trans_array[:, 3] = [u * 1.4 for u in trans_array[:, 3]]               # instrumentalness
    trans_array[:, 4] = [u * 0.9 for u in trans_array[:, 4]]               # key
    trans_array[:, 5] = [u * 1.0 for u in trans_array[:, 5]]               # liveness
    trans_array[:, 6] = [u * 1.0 for u in trans_array[:, 6]]               # speechiness
    trans_array[:, 7] = [u * 1.1 for u in trans_array[:, 7]]               # tempo
    trans_array[:, 8] = [u * 2.5 for u in trans_array[:, 8]]               # valence
    knn = NearestNeighbors()
    knn.fit(trans_array)
    rec = knn.kneighbors(trans_array[[0]], n_neighbors=neighbors + 1)
    predict_response = []
    for n in range(1, neighbors + 1):
        if rec[0][0][n] <= max_distance:
            pred_dict = (merged_df.loc[rec[1][0][n], 'id'], rec[0][0][n])
            predict_response.append(pred_dict)
    pred = pd.DataFrame(predict_response, columns=['recommendation', 'distance'])
    df_predict_tracks = pd.DataFrame()
    # create dataframe
    a = [sp.track(ii)['artists'][0]['name'] for ii in pred['recommendation']]
    b = [sp.track(ii)['name'] for ii in pred['recommendation']]
    c = [sp.track(ii)['id'] for ii in pred['recommendation']]
    d = [sp.track(ii)['external_urls']['spotify'] for ii in pred['recommendation']]
    e = [sp.track(ii)['explicit'] for ii in pred['recommendation']]
    f = [sp.track(ii)['preview_url'] for ii in pred['recommendation']]
    g = [sp.track(ii)['album']['images'][1]['url'] for ii in pred['recommendation']]
    # Save the results
    df_predict_tracks['artist_name'] = a
    df_predict_tracks['song_name'] = b
    df_predict_tracks['id'] = c
    df_predict_tracks['url'] = d
    df_predict_tracks['explicit'] = e
    df_predict_tracks['preview'] = f
    df_predict_tracks['image'] = g
    df_predict_tracks['preview'] = df_predict_tracks['preview'].apply(get_rid_of_nulls)
    df_predict_tracks.index += 1
    return json.dumps(json.loads(df_predict_tracks.to_json(orient='index')), indent=2)
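# The recommendation step above boils down to a k-nearest-neighbours query in a weighted,
# standardized feature space. A minimal standalone sketch of that query (the feature matrix
# here is synthetic; only the per-feature weights are taken from the function above):
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

rng = np.random.default_rng(0)
features = StandardScaler().fit_transform(rng.normal(size=(50, 9)))  # 50 tracks, 9 audio features
weights = np.array([2.4, 1.0, 1.7, 1.4, 0.9, 1.0, 1.0, 1.1, 2.5])    # per-feature emphasis
weighted = features * weights

knn = NearestNeighbors().fit(weighted)
distances, indices = knn.kneighbors(weighted[[0]], n_neighbors=5)
print(indices[0][1:], distances[0][1:])  # neighbours of track 0, excluding itself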
def reduce_dim(no_of_components, U, X):
    U_red = U[:, :no_of_components]
    X = np.array(X)
    Z = np.matmul(U_red.T, X.T)
    Z = Z.T
    Z_new = pd.DataFrame(Z, columns=["pc" + str(i) for i in range(Z.shape[1])])
    return Z_new


# In[4]:

U_copy = U.copy()
X_copy = X.copy()
Z_1 = reduce_dim(17, U_copy, X_copy)

U_copy = U.copy()
X_copy = X.copy()
Z_2 = reduce_dim(26, U_copy, X_copy)

U_copy = U.copy()
X_copy = X.copy()
Z_3 = reduce_dim(38, U_copy, X_copy)


# In[5]:

print(Z_1.shape)
print(Z_2.shape)
print(Z_3.shape)
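# For reference, the projection performed by reduce_dim (multiplying by the first columns of a
# precomputed U) is what sklearn's PCA does internally. A minimal sketch on synthetic data,
# assuming U would come from an SVD/eigendecomposition of the standardized data:
import numpy as np
import pandas as pd
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

rng = np.random.RandomState(0)
X_demo = StandardScaler().fit_transform(rng.normal(size=(100, 40)))

Z_pca = PCA(n_components=17).fit_transform(X_demo)
Z_pca = pd.DataFrame(Z_pca, columns=["pc" + str(i) for i in range(Z_pca.shape[1])])
print(Z_pca.shape)  # (100, 17), analogous to Z_1 above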
# x_data = processing.snv(x_data)
x_data = StandardScaler().fit_transform(x_data)

for i, test_model in enumerate(models_to_test):
    for j, y_transformation in enumerate(y_transformations):
        for k, x_transformation in enumerate(x_transformations):
            title = "{0}\ny transformer: {1}" \
                    "\nx transformer: {2}".format(model_names[i],
                                                  transormation_names[j],
                                                  transormation_names[k])
            if x_transformation:
                x_data_new = x_transformation(x_data)
            else:
                x_data_new = x_data.copy()
            if type(y_transformation) == tuple:
                _model = TransformedTargetRegressor(regressor=clone(test_model),
                                                    func=y_transformation[0],
                                                    inverse_func=y_transformation[1])
            elif y_transformation:
                _model = TransformedTargetRegressor(regressor=clone(test_model),
                                                    transformer=y_transformation)
            else:
                _model = clone(test_model)
            print(title)
            try:
Y['AGE2'] = Y['AGE']**2
# Y['SITE'] = Y['SITE'].astype('category').cat.codes
# Y['PTRACCAT'] = Y['PTRACCAT'].astype('category').cat.codes
Y['PTGENDER'] = Y['PTGENDER'].astype('category').cat.codes
Y = Y.fillna(Y.mean())
print(Y.shape)
print(X.shape)
print(icv.shape)

W = np.linalg.inv(Y.T.dot(Y)).dot(Y.T.dot(X))

# Subtract effect of age
X = X - Y.dot(W)

# Save corrected data: cdf
cdf = X.copy()
cdf.columns = cols
cdf['label'] = df['label']
corrected_csv = join(data_csv[:-4] + '_corrected.csv')
cdf.to_csv(corrected_csv)
print('CSV saved!')

# Reduce dimensionality: X_ld
n_comp = 5 if X.shape[1] > 5 else X.shape[1]
X_ld = PCA(n_components=n_comp).fit_transform(X)

# Convert to DataFrame
cols = ['PC%d' % (c + 1) for c in range(X_ld.shape[1])]
X_ld = pd.DataFrame(data=X_ld, columns=cols, index=df.index)
X_ld['label'] = df['label']
X_ld = X_ld[(X_ld['label'] == 'MCIc') | (X_ld['label'] == 'MCInc')]
def get_subsampling_index2(data_process, standard_scale=True, cutoff_sig=0.02, rate=0.3,
                           method="pykdtree", verbose=1):
    """Find the indices of the subsampled dataset using a nearest-neighbour search based algorithm.

    Parameters
    -------------
    data_process: List. The list of data points, with selected features
    standard_scale [True]: Boolean. Whether to apply a standard scaler to the dataset prior to subsampling
    cutoff_sig [0.02]: Float. Cutoff significance. The cutoff distance equals this factor times the
        Euclidean norm of the standard deviations in all dimensions of the data points
    rate [0.3]: Float. Probability of deletion
    method ["pykdtree"]: String. Which backend nearest-neighbour model to use.
        Possible choices: ["pykdtree", "nmslib", "sklearn", "scipy", "annoy", "flann"]
    verbose [1]: Integer. Level of verbosity

    Return
    -------------
    overall_keep_list: The list of indices of the final subsampled entries
    """
    if verbose >= 1:
        print("Started NN-subsampling, original length: {}".format(len(data_process)))
    method = method.lower()
    start = time.time()
    if method == "flann":
        if verbose >= 1:
            print("use flann backend")
    elif method == "pykdtree":
        if verbose >= 1:
            print("use pykdtree backend")
    elif method == "sklearn":
        if verbose >= 1:
            print("use sklearn nearest neighbors backend")
    elif method == "scipy":
        if verbose >= 1:
            print("use scipy cKDTree backend")
    elif method == "annoy":
        if verbose >= 1:
            print("use annoy backend")
    elif method == "nmslib":
        if verbose >= 1:
            print("use nmslib backend")
    else:
        print("method {} not implemented".format(method))
        raise NotImplementedError

    # apply standard scaling
    if standard_scale:
        if verbose >= 2:
            print("Subsample with standard scaled data")
        data_process = StandardScaler().fit_transform(np.asarray(data_process).copy())
    else:
        if verbose >= 2:
            print("Subsample with original data")
        data_process = np.asarray(data_process).copy()

    # set cutoff distance
    list_of_descs = zip(*data_process)
    sum_std2 = 0.
    for descs in list_of_descs:
        temp_std = np.std(descs)
        sum_std2 += temp_std**2
    cutoff = cutoff_sig * np.sqrt(sum_std2)

    # initialize the index
    overall_keep_list = np.arange(len(data_process)).tolist()
    keep_going = True
    iter_count = 1
    while keep_going:
        if verbose >= 2:
            print('start iteration {}, total length: {}'.format(iter_count, len(overall_keep_list)))
        start_cycle = time.time()
        temp_data_process = get_array_based_on_index(data_process.copy(), overall_keep_list)

        # build and query nearest neighbour model
        if method == "flann":
            flann = FLANN()
            indices, distances = flann.nn(temp_data_process, temp_data_process, 2, algorithm="kmeans")
        elif method == "scipy":
            kd_tree = cKDTree(temp_data_process)
            distances, indices = kd_tree.query(temp_data_process, k=2)
        elif method == "pykdtree":
            kd_tree = KDTree(temp_data_process, leafsize=6)
            distances, indices = kd_tree.query(temp_data_process, k=2)
        elif method == "sklearn":
            nbrs = NearestNeighbors(n_neighbors=2, algorithm='kd_tree', n_jobs=-1).fit(temp_data_process)
            distances, indices = nbrs.kneighbors(temp_data_process)
        elif method == "annoy":
            annoy = AnnoyIndex(len(temp_data_process[0]), metric='euclidean')
            for i in range(len(temp_data_process)):
                annoy.add_item(i, temp_data_process[i])
            annoy.build(1)
            distances = []
            indices = []
            for i in range(len(temp_data_process)):
                temp_index, temp_dist = annoy.get_nns_by_vector(temp_data_process[i], 2,
                                                                include_distances=True)
                indices.append([i, temp_index[1]])
                distances.append([0.0, temp_dist[1]])
        elif method == "nmslib":
            index = nmslib.init(method='hnsw', space='l2')
            index.addDataPointBatch(temp_data_process)
            index.createIndex(print_progress=False)
            neighbours = index.knnQueryBatch(temp_data_process, k=2)
            distances = []
            indices = []
            for item in neighbours:
                indices.append(item[0])
                distances.append(item[1])
        else:
            raise NotImplementedError

        # if the distance between a point and its nearest neighbour is below the cutoff distance,
        # add the nearest neighbour to the candidate removal list
        remove_index_li = []
        index_li = []
        for index, distance in zip(indices, distances):
            index_li.append(index[0])
            if distance[1] <= cutoff:
                remove_index_li.append(index[1])

        # randomly select data points in the candidate removal list (based on rate)
        # and form the final removal list of this iteration;
        # stop the cycle if the final removal list is empty
        temp_num = int(ceil(float(len(remove_index_li)) * rate))
        if temp_num == 0:
            keep_going = False
        remove_index_li = random_subsampling(remove_index_li, temp_num)
        temp_keep_list = remove_list_from_list(index_li, remove_index_li)
        overall_keep_list = [overall_keep_list[i] for i in temp_keep_list]
        if verbose >= 2:
            print('end iteration {}. length: {}\t time:{}'.format(
                iter_count, len(overall_keep_list), time.time() - start_cycle))
        iter_count += 1
    if verbose >= 1:
        print('end NN-subsampling. length: {}\t time:{}'.format(
            len(overall_keep_list), time.time() - start))
    return overall_keep_list
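# A minimal usage sketch of the subsampler defined above, on synthetic data. It assumes the
# module's own imports and helpers (numpy, the chosen backend, get_array_based_on_index,
# random_subsampling, remove_list_from_list) are available.
import numpy as np

rng = np.random.RandomState(0)
points = rng.normal(size=(1000, 3)).tolist()  # dense synthetic point cloud

# Keep a thinned-out subset; the returned indices refer to rows of the original list.
keep = get_subsampling_index2(points, standard_scale=True, cutoff_sig=0.05,
                              rate=0.3, method="sklearn", verbose=1)
subsampled = [points[i] for i in keep]
print(len(points), "->", len(subsampled))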
class Cell2Patients():

    def __init__(self, clf, threshold, clf_transfer=None, max_run=None,
                 out_dir="./semi_supervised/", field_separator="\t",
                 increment_rate=10, verbose=True, normalize=False):
        self._clf = clone(clf)
        if clf_transfer == None:
            self._clf_transfer = clone(clf)
        else:
            self._clf_transfer = clf_transfer
        self._normalize = normalize
        self._increment_rate = increment_rate
        self._max_run = max_run
        self._out_dir = os.path.realpath(out_dir)
        os.makedirs(out_dir, exist_ok=True)
        self._Xl = []
        self._Xu = []
        self._names_l = []
        self._features_l = []
        self._names_u = []
        self._features_u = []
        self._Y = []
        self._patient_added = []
        self._verbose = verbose
        self._FS = field_separator
        self._leftover = ""
        self._thr = threshold
        if self._thr < 0.51 or self._thr > 0.99:
            raise ValueError("Threshold must be between 0.51 and 0.99")

    def _mex(self, txt, end="\n"):
        if self._verbose:
            print(txt, flush=True, end=end)
        else:
            if end == "\n":
                logger.debug("{}{}".format(self._leftover, txt))
                self._leftover = ""
            else:
                self._leftover = txt

    def import_data(self, input_file, labelled=True):
        if labelled:
            X, features, names, Y = self._Xl, self._features_l, self._names_l, self._Y
        else:
            X, features, names, Y = self._Xu, self._features_u, self._names_u, None
        if not os.path.exists(input_file):
            raise FileNotFoundError("File {} not found!".format(input_file))
        head_n = 2 if labelled else 1
        with open(input_file, "r") as f:
            while head_n != 0:
                line = [v.strip() for v in f.readline().split(self._FS)]
                if line[0] == "group" or line[0] == "label":
                    for idx in range(1, len(line)):
                        Y.append(line[idx])
                elif line[0] == "name":
                    for idx in range(1, len(line)):
                        names.append(line[idx])
                else:
                    raise ValueError(
                        "File {} is not well formatted. Found {} as first field, should be 'name' or 'group'."
                        .format(input_file, line[0]))
                head_n -= 1
            line = f.readline()
            while line:
                line = [v.strip() for v in line.split(self._FS)]
                if len(line) == len(names) + 1:
                    features.append(line[0])
                    vals = []
                    for idx in range(1, len(line)):
                        try:
                            vals.append(float(line[idx]))
                        except ValueError:
                            vals.append("NaN")
                    X.append(vals)
                line = f.readline()

    def _initData(self):
        self._class_names = sorted(list(set(self._Y)), reverse=True)
        if len(self._class_names) != 2:
            raise ValueError(
                "Cell2Patients works only with binary classification.\n{} labels found: {}"
                .format(len(self._class_names), ", ".join(list(self._class_names))))
        for i in range(len(self._Y)):
            self._Y[i] = self._class_names.index(self._Y[i])
        if sorted(self._features_l) != sorted(self._features_u):
            raise ValueError("Features in labelled and unlabelled are not the same!")
        self._features = self._features_l.copy()
        for i in range(len(self._features_l)):
            self._features_l[i] = self._features.index(self._features_l[i])
        for i in range(len(self._features_u)):
            self._features_u[i] = self._features.index(self._features_u[i])
        warnings.filterwarnings("ignore")
        self._Y = np.asarray(self._Y)
        self._Xl = SimpleImputer(strategy="mean").fit_transform(
            np.array(self._Xl)[self._features_l, :].T)
        self._Xu = SimpleImputer(strategy="mean").fit_transform(
            np.array(self._Xu)[self._features_u, :].T)
        if self._normalize:
            self._Xl = StandardScaler().fit_transform(self._Xl)
            self._Xu = StandardScaler().fit_transform(self._Xu)
        self._Ypatients = [-1 for _ in self._names_u]
        self._mex("Working with {} {} and {} {} cell lines".format(
            sum(self._Y), self._class_names[1],
            len(self._Y) - sum(self._Y), self._class_names[0]))

    def run(self):
        self._initData()
        self._mex("Starting the transfer learning of {} cell lines to {} patients".format(
            len(self._names_l), len(self._names_u)))
        self._patients_proba = []
        self._cell_proba = []
        running = True
        gen = 0
        Xl = self._Xl.copy()
        Y = self._Y.copy()
        self._feature_importances = []
        transferred = []
        prev_novel = [1, 1]
        while running:
            gen += 1
            self._mex("-" * 20)
            self._mex("Training the generation number {}".format(gen))
            if gen == 1:
                clf = clone(self._clf)
            else:
                clf = clone(self._clf_transfer)
            clf.fit(Xl, Y)
            self._feature_importances.append(clf.feature_importances_)
            newX, newY, novel = self._evaluate(clf, transferred, gen * self._increment_rate)
            tot_pos = sum([1 if v >= self._thr else 0 for v in self._patients_proba[-1]])
            if novel[0] + novel[1] == 0:
                self._mex("No patients added.")
                running = False
            elif (prev_novel[0] == 0 and novel[0] == 0 and tot_pos == 0):
                self._mex("Model is overfitting over {}. Stopping the iteration.".format(
                    self._class_names[1]))
                running = False
            elif (prev_novel[1] == 0 and novel[1] == 0 and tot_pos == len(self._patients_proba[-1])):
                self._mex("Model is overfitting over {}. Stopping the iteration.".format(
                    self._class_names[0]))
                running = False
            else:
                self._mex("Adding {} patients, {} novel:".format(len(newY), novel[0] + novel[1]))
                self._mex(" - {} labelled as {} ( {} novel ) ".format(
                    len(newY) - sum(newY), self._class_names[0], novel[0]))
                self._mex(" - {} labelled as {} ( {} novel ) ".format(
                    sum(newY), self._class_names[1], novel[1]))
                self._patient_added.append([len(newY) - sum(newY), sum(newY)])
                Xl = np.append(self._Xl, newX, axis=0)
                Y = np.append(self._Y, newY, axis=0)
            prev_novel = novel.copy()
        self._mex("-" * 20)
        self._write_results()
        return self.get_patient_labelled_data()

    def _evaluate(self, clf, transferred, limit):
        proba = clf.predict_proba(self._Xu)[:, 0]
        self._patients_proba.append(proba.copy())
        self._cell_proba.append(clf.predict_proba(self._Xl)[:, 0])
        newX, newY = [], []
        proba = sorted(enumerate(proba), key=lambda tup: tup[1])
        novel = [0, 0]
        i = min(limit, len(proba)) - 1
        while i >= 0:
            if 1 - proba[i][1] >= self._thr:
                newX.append(list(self._Xu[proba[i][0], :]))
                newY.append(1)
                if not proba[i][0] in transferred:
                    novel[1] += 1
                    transferred.append(proba[i][0])
                    self._Ypatients[proba[i][0]] = 1
            i -= 1
        i = len(proba) - 1
        n = 0
        while n < limit and proba[i][1] >= self._thr:
            newX.append(list(self._Xu[proba[i][0], :]))
            newY.append(0)
            if not proba[i][0] in transferred:
                novel[0] += 1
                transferred.append(proba[i][0])
                self._Ypatients[proba[i][0]] = 0
            n += 1
            i -= 1
        return np.array(newX), np.array(newY), novel

    def _plot_importances(self, pdf):
        plt.close()
        top_10 = [[v[0] for v in sorted(enumerate(self._feature_importances[0]),
                                        key=lambda tup: tup[1], reverse=True)[:10]],
                  [v[0] for v in sorted(enumerate(self._feature_importances[-1]),
                                        key=lambda tup: tup[1], reverse=True)[:10]]]
        for idx in top_10[0]:
            plt.plot(range(len(self._feature_importances)),
                     [v[idx] for v in self._feature_importances],
                     "-r", label="Top10 Begin")
        for idx in top_10[1]:
            plt.plot(range(len(self._feature_importances)),
                     [v[idx] for v in self._feature_importances],
                     "--b", label="Top10 End")
        plt.xlabel('Generation')
        plt.ylabel('Feature Importance')
        handles, labels = plt.gca().get_legend_handles_labels()
        newLabels, newHandles = [], []
        for handle, label in zip(handles, labels):
            if label not in newLabels:
                newLabels.append(label)
                newHandles.append(handle)
        plt.legend(newHandles, newLabels)
        plt.title("Top10 feature importance evolution")
        plt.savefig(pdf, format="pdf")
        return

    def _plot_bar(self, pdf):
        plt.close()
        plt.bar([v - 0.21 for v in range(1, len(self._patient_added) + 1)],
                [v[0] for v in self._patient_added],
                width=0.40, color="#F8766D", label=self._class_names[0])
        plt.bar([v + 0.21 for v in range(1, len(self._patient_added) + 1)],
                [v[1] for v in self._patient_added],
                width=0.40, color="#00BFC4", label=self._class_names[1])
        plt.xlabel("Round")
        plt.ylabel("Number of patients added")
        plt.title("Patients added each round")
        plt.savefig(pdf, format="pdf")
        handles, labels = plt.gca().get_legend_handles_labels()
        newLabels, newHandles = [], []
        for handle, label in zip(handles, labels):
            if label not in newLabels:
                newLabels.append(label)
                newHandles.append(handle)
        plt.legend(newHandles, newLabels)
        plt.legend()
        plt.close()

    def _plotPCA(self, pdf):
        X_pca = PCA(n_components=2, svd_solver='full').fit_transform(self._Xl)
        plt.close()
        colors = ['#F8766D', '#00BFC4', 'grey']
        for i in range(2):
            plt.scatter(X_pca[self._Y == i, 0], X_pca[self._Y == i, 1],
                        color=colors[i], lw=2, label="{}".format(self._class_names[i]))
        plt.legend()
        plt.title("Cell lines PCA")
        plt.savefig(pdf, format="pdf")
        plt.close()
        X_pca = PCA(n_components=2, svd_solver='full').fit_transform(self._Xu)
        class_names = self._class_names.copy()
        class_names.append("Unclassified")
        Y = np.array(self._Ypatients)
        for i in range(-1, 2):
            plt.scatter(X_pca[Y == i, 0], X_pca[Y == i, 1],
                        color=colors[i], lw=2, label="{}".format(class_names[i]))
        plt.legend()
        plt.title("Patients PCA")
        plt.savefig(pdf, format="pdf")
        plt.close()

    def _plot_proba(self, pdf):
        plt.close()
        n_gen = len(self._patients_proba)
        plt.plot(range(n_gen), [self._thr for _ in range(n_gen)], "--", color="grey")
        plt.plot(range(n_gen), [1 - self._thr for _ in range(n_gen)], "--", color="grey")
        for gen in range(n_gen):
            blue, red, grey = [], [], []
            for p in self._patients_proba[gen]:
                if p >= self._thr:
                    red.append(p)
                elif p <= 1 - self._thr:
                    blue.append(p)
                else:
                    grey.append(p)
            if len(red) > 0:
                plt.plot([gen + ((np.random.rand() - np.random.rand()) * 0.3) for _ in range(len(red))],
                         red, ".", color="#F8766D", label=self._class_names[0])
            if len(blue) > 0:
                plt.plot([gen + ((np.random.rand() - np.random.rand()) * 0.3) for _ in range(len(blue))],
                         blue, ".", color="#00BFC4", label=self._class_names[1])
            if len(grey) > 0:
                plt.plot([gen + ((np.random.rand() - np.random.rand()) * 0.3) for _ in range(len(grey))],
                         grey, ".", color="grey", label="Unclassified")
        plt.xlabel("Generation")
        plt.ylabel("{} Probability".format(self._class_names[0]))
        plt.ylim(0, 1)
        plt.title("Patients probabilities")
        handles, labels = plt.gca().get_legend_handles_labels()
        newLabels, newHandles = [], []
        for handle, label in zip(handles, labels):
            if label not in newLabels:
                newLabels.append(label)
                newHandles.append(handle)
        plt.legend(newHandles, newLabels)
        plt.savefig(pdf, format="pdf")
        plt.close()
        return

    def _plot_proba_cells(self, pdf):
        plt.close()
        n_gen = len(self._cell_proba)
        plt.plot(range(n_gen), [0.6 for _ in range(n_gen)], "--", color="grey")
        plt.plot(range(n_gen), [0.4 for _ in range(n_gen)], "--", color="grey")
        for gen in range(n_gen):
            blue, red, grey = [], [], []
            for p in self._cell_proba[gen]:
                if p >= 0.6:
                    red.append(p)
                elif p <= 0.4:
                    blue.append(p)
                else:
                    grey.append(p)
            if len(red) > 0:
                plt.plot([gen + ((np.random.rand() - np.random.rand()) * 0.3) for _ in range(len(red))],
                         red, ".", color="#F8766D")
            if len(blue) > 0:
                plt.plot([gen + ((np.random.rand() - np.random.rand()) * 0.3) for _ in range(len(blue))],
                         blue, ".", color="#00BFC4")
            if len(grey) > 0:
                plt.plot([gen + ((np.random.rand() - np.random.rand()) * 0.3) for _ in range(len(grey))],
                         grey, ".", color="grey")
        plt.xlabel("Generation")
        plt.ylabel("{} Probability".format(self._class_names[0]))
        plt.title("Cell lines probabilities")
        plt.ylim(0, 1)
        plt.savefig(pdf, format="pdf")
        plt.close()
        return

    def get_patient_labelled_data(self):
        X = []
        Y = []
        for p in range(len(self._names_u)):
            if self._Ypatients[p] != -1:
                Y.append(self._Ypatients[p])
                X.append(self._Xu[p, :])
        return np.array(X), np.array(Y)

    def get_patient_unlabelled_data(self):
        X = []
        for p in range(len(self._names_u)):
            if self._Ypatients[p] == -1:
                X.append(self._Xu[p, :])
        return np.array(X)

    def _write_results(self):
        pdf = PdfPages("{}/graphs.pdf".format(self._out_dir))
        self._plot_importances(pdf)
        self._plot_proba(pdf)
        self._plot_proba_cells(pdf)
        self._plot_bar(pdf)
        self._plotPCA(pdf)
        pdf.close()
        with open("{}/feature_importances.tsv".format(self._out_dir), "w") as ofs:
            ofs.write("Generation\tFeatureName\tImportance\n")
            for gen in range(len(self._feature_importances)):
                for feat in range(len(self._features)):
                    ofs.write("{}\t{}\t{:.3f}\n".format(
                        gen, self._features[feat], self._feature_importances[gen][feat]))
        with open("{}/patients_probabilities.tsv".format(self._out_dir), "w") as ofs:
            ofs.write("Generation\tPatientName\tProbability_{}\tProbability_{}\tGroup\n".format(
                self._class_names[0], self._class_names[1]))
            for gen in range(len(self._patients_proba)):
                for p in range(len(self._names_u)):
                    ofs.write("{}\t{}\t{}\t{}\n".format(
                        gen, self._names_u[p], self._patients_proba[gen][p],
                        1 - self._patients_proba[gen][p]))
        with open("{}/cells_probabilities.tsv".format(self._out_dir), "w") as ofs:
            ofs.write("Generation\tCellName\tProbability_{}\tProbability_{}\n".format(
                self._class_names[0], self._class_names[1]))
            for gen in range(len(self._cell_proba)):
                for p in range(len(self._names_l)):
                    ofs.write("{}\t{}\t{}\t{}\n".format(
                        gen, self._names_l[p], self._cell_proba[gen][p],
                        1 - self._cell_proba[gen][p]))
        with open("{}/patients_labels.tsv".format(self._out_dir), "w") as ofs:
            ofs.write("Name\tLabel\n")
            for i in range(len(self._names_u)):
                ofs.write("{}\t{}\n".format(
                    self._names_u[i],
                    self._class_names[self._Ypatients[i]] if self._Ypatients[i] != -1 else "NA"))
def process_data(regressor_name, X, normalise=None):
    if regressor_name in classical_ml:
        tmp = []
        for i in tqdm(range(len(X))):
            # 1. flatten
            # 2. fill missing values
            x = X.iloc[i, 0].reset_index(drop=True)
            x.interpolate(method='linear', inplace=True, limit_direction='both')
            if normalise == "standard":
                x = StandardScaler().fit_transform(x.values.reshape(-1, 1))
                x = pd.DataFrame(x)
            elif normalise == "minmax":
                x = MinMaxScaler().fit_transform(x.values.reshape(-1, 1))
                x = pd.DataFrame(x)
            tmp2 = x.values.tolist()
            for j in range(1, len(X.columns)):
                x = X.iloc[i, j].reset_index(drop=True)
                x.interpolate(method='linear', inplace=True, limit_direction='both')
                if normalise == "standard":
                    x = StandardScaler().fit_transform(x.values.reshape(-1, 1))
                    x = pd.DataFrame(x)
                elif normalise == "minmax":
                    x = MinMaxScaler().fit_transform(x.values.reshape(-1, 1))
                    x = pd.DataFrame(x)
                tmp2 = tmp2 + x.values.tolist()
            tmp2 = pd.DataFrame(tmp2).transpose()
            tmp.append(tmp2)
        X = pd.concat(tmp).reset_index(drop=True)
    else:
        tmp = []
        for i in tqdm(range(len(X))):
            x = X.iloc[i, :]
            _x = x.copy(deep=True)

            # 1. find the maximum length of each dimension
            all_len = [len(y) for y in _x]
            max_len = max(all_len)

            # 2. adjust the length of each dimension
            _y = []
            for y in _x:
                # 2.1 fill missing values
                if y.isnull().any():
                    y = y.interpolate(method='linear', limit_direction='both')
                # 2.2 if the lengths of the dimensions differ, uniformly scale the shorter one to the max length
                if len(y) < max_len:
                    y = uniform_scaling(y, max_len)
                _y.append(y)
            _y = np.array(np.transpose(_y))

            if normalise == "standard":
                scaler = StandardScaler().fit(_y)
                _y = scaler.transform(_y)
            if normalise == "minmax":
                scaler = MinMaxScaler().fit(_y)
                _y = scaler.transform(_y)

            tmp.append(_y)
        X = np.array(tmp)
    return X
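# The per-dimension normalisation above relies on interpolating a univariate series and
# reshaping it to a single column before scaling. A minimal standalone sketch of that step
# (the toy series is made up):
import pandas as pd
from sklearn.preprocessing import StandardScaler

series = pd.Series([1.0, None, 3.0, 4.0, 10.0])
series.interpolate(method='linear', inplace=True, limit_direction='both')  # fill missing values
scaled = StandardScaler().fit_transform(series.values.reshape(-1, 1))      # shape (n_samples, 1)
print(pd.DataFrame(scaled))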
clf2.fit(X_train)

clf3 = AutoEncoder(hidden_neurons=[25, 15, 10, 2, 10, 15, 25])
clf3.fit(X_train)

# Predict the anomaly scores
y_test_scores = clf1.decision_function(X_test)
y_test_scores = pd.Series(y_test_scores)

# Step 2: Determine the cut point
import matplotlib.pyplot as plt
plt.hist(y_test_scores, bins='auto')
plt.title("Histogram with Model Clf3 Anomaly Scores")
plt.show()

df_test = X_test.copy()
df_test['score'] = y_test_scores
df_test['cluster'] = np.where(df_test['score'] < 4, 0, 1)
df_test['cluster'].value_counts()

# Step 3: Get the summary statistics by cluster
df_test.groupby('cluster').mean()

# ensemble
from pyod.models.combination import aom, moa, average, maximization

# Put all the predictions in a data frame
train_scores = pd.DataFrame({'clf1': clf1.decision_scores_,
def get_feature_patches(PARAMS, FV, patch_size, patch_shift, input_shape):
    # Removing NaN and Inf
    if any(np.array([9, 10, 21, 22, 39]) == np.shape(FV)[1]):
        FV = FV[~np.isnan(FV).any(axis=1), :]
        FV = FV[~np.isinf(FV).any(axis=1), :]
    else:
        FV = FV[:, ~np.isnan(FV).any(axis=0)]
        FV = FV[:, ~np.isinf(FV).any(axis=0)]

    FV = StandardScaler(copy=False).fit_transform(FV)
    # FV should be of the shape (nFeatures, nFrames)
    if any(np.array([9, 10, 21, 22, 39]) == np.shape(FV)[1]):
        FV = FV.T

    frmStart = 0
    frmEnd = 0
    patchNum = 0
    patches = np.empty([])
    if np.shape(FV)[1] < patch_size:
        FV1 = FV.copy()
        while np.shape(FV)[1] <= patch_size:
            FV = np.append(FV, FV1, axis=1)

    # while frmEnd<np.shape(FV)[1]:
    #     # print('get_feature_patches: ', frmStart, frmEnd, np.shape(FV))
    #     frmStart = patchNum*patch_shift
    #     frmEnd = np.min([patchNum*patch_shift+patch_size, np.shape(FV)[1]])
    #     if frmEnd-frmStart<patch_size:
    #         frmStart = frmEnd - patch_size
    #     if np.size(patches)<=1:
    #         patches = np.expand_dims(FV[:, frmStart:frmEnd], axis=0)
    #     else:
    #         patches = np.append(patches, np.expand_dims(FV[:, frmStart:frmEnd], axis=0), axis=0)
    #     patchNum += 1

    # startTime = time.clock()
    # for frmStart in range(0, np.shape(FV)[1], patch_shift):
    #     # print('get_feature_patches: ', frmStart, frmEnd, np.shape(FV))
    #     frmEnd = np.min([frmStart+patch_size, np.shape(FV)[1]])
    #     if frmEnd-frmStart<patch_size:
    #         frmStart = frmEnd - patch_size
    #     if np.size(patches)<=1:
    #         patches = np.array(FV[:, frmStart:frmEnd], ndmin=3)
    #     else:
    #         patches = np.append(patches, np.array(FV[:, frmStart:frmEnd], ndmin=3), axis=0)
    # print('My splitting: ', time.clock()-startTime, np.shape(patches))

    startTime = time.clock()
    numPatches = int(np.ceil(np.shape(FV)[1] / patch_shift))
    patches = PatchExtractor(patch_size=(np.shape(FV)[0], patch_size),
                             max_patches=numPatches).transform(np.expand_dims(FV, axis=0))
    # print('sklearn splitting: ', time.clock()-startTime, np.shape(patches))
    # print('Patches: ', np.shape(patches))

    if (np.shape(patches)[1] == 9) or (np.shape(patches)[1] == 10):
        diff_dim = input_shape[0] - np.shape(patches)[1]
        zero_padding = np.zeros((np.shape(patches)[0], diff_dim, np.shape(patches)[2]))
        patches = np.append(patches, zero_padding, axis=1)
    elif np.shape(patches)[1] == 22:
        patches = patches[:, :21, :]
    elif np.shape(patches)[1] == 39:
        if not PARAMS['39_dim_CC_feat']:
            first_7_cep_dim = np.array(list(range(0, 7)) + list(range(13, 20)) + list(range(26, 33)))
            patches = patches[:, first_7_cep_dim, :]
    # print('Patches: ', np.shape(patches))
    return patches
from tempfile import mkdtemp
from functools import wraps
import pytest
from sklearn import datasets
import warnings

n_clusters = 3
# X = generate_clustered_data(n_clusters=n_clusters, n_samples_per_cluster=50)
X, y = make_blobs(n_samples=200, random_state=10)
X, y = shuffle(X, y, random_state=7)
X = StandardScaler().fit_transform(X)

X_missing_data = X.copy()
X_missing_data[0] = [np.nan, 1]
X_missing_data[5] = [np.nan, np.nan]


def test_missing_data():
    """Tests if nan data are treated as infinite distance from all other
    points and assigned to -1 cluster"""
    model = HDBSCAN().fit(X_missing_data)
    assert model.labels_[0] == -1
    assert model.labels_[5] == -1
    assert model.probabilities_[0] == 0
    assert model.probabilities_[5] == 0
    clean_indices = list(range(1, 5)) + list(range(6, 200))
    clean_model = HDBSCAN().fit(X_missing_data[clean_indices])
    assert np.allclose(clean_model.labels_, model.labels_[clean_indices])
def load_data(path, file_name):
    with open(path + file_name + '.pickle', 'rb') as handle:
        data = pickle.load(handle)
    return data


# -------------------------------------------------------------------------------------------------------------
orient = 9
pix_per_cell = 8
cell_per_block = 2
hist_bins = 32

X_scaler = StandardScaler()
X_scaler.copy = True
X_scaler.with_mean = True
X_scaler.with_std = True

filename = 'svm_model.sav'

data_vehicle = load_data("./data/", "Vehicle")
data_no_vehicle = load_data("./data/", "No_Vehicle")

ones = np.ones(len(data_vehicle))
zeros = np.zeros(len(data_no_vehicle))
feature_list = [ones, zeros]
label = np.hstack(feature_list)

# for idx in range(len(data_no_vehicle)):
#     img = data_no_vehicle[idx]
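# Setting copy, with_mean and with_std as attributes after construction works, but these are
# StandardScaler's defaults anyway; the more conventional, equivalent form is to pass them to
# the constructor (sketch only, not part of the pipeline above):
from sklearn.preprocessing import StandardScaler

X_scaler = StandardScaler(copy=True, with_mean=True, with_std=True)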
coef[:n_relevant_features] = coef_min + rng.rand(n_relevant_features)

# The correlation of our design: variables correlated by blocs of 3
corr = np.zeros((n_features, n_features))
for i in range(0, n_features, block_size):
    corr[i:i + block_size, i:i + block_size] = 1 - conditioning
corr.flat[::n_features + 1] = 1
corr = linalg.cholesky(corr)

# Our design
X = rng.normal(size=(n_samples, n_features))
X = np.dot(X, corr)
# Keep [Wainwright2006] (26c) constant
X[:n_relevant_features] /= np.abs(linalg.svdvals(X[:n_relevant_features])).max()
X = StandardScaler().fit_transform(X.copy())

# The output variable
y = np.dot(X, coef)
y /= np.std(y)
# We scale the added noise as a function of the average correlation
# between the design and the output variable
y += noise_level * rng.normal(size=n_samples)
mi = mutual_incoherence(X[:, :n_relevant_features], X[:, n_relevant_features:])

###########################################################################
# Plot stability selection path, using a high eps for early stopping
# of the path, to save computation time
alpha_grid, scores_path = lasso_stability_path(X, y, random_state=42, eps=0.05)
    keras.layers.Dense(240, activation=tf.nn.relu),
    keras.layers.Dropout(0.2),
    keras.layers.Dense(240, activation=tf.nn.relu),
    keras.layers.Dropout(0.1),
    keras.layers.Dense(240, activation=tf.nn.tanh),
    keras.layers.Dropout(0.1),
    keras.layers.Dense(5, activation=tf.nn.softmax),
])
model.name = 'model' + str(cont)
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

### drop outliers
tmp = pd.DataFrame(data=X.copy())
tmp.insert(0, column='y', value=y)
X_curr, y_curr = drop_outliers(tmp, cont)

# model.fit(X_curr, y_curr, epochs = 50)
model.fit(X_curr, y_curr, epochs=200)
# results[cont] = model.evaluate(X_test, y_test)
# print("done with ", model.name)
# print(results[cont])

# results ={}
# # exit()