def _get_node_distance_matrix(self, datapoint, som_array):
    """Get distance between datapoint and each SOM node using the selected metric.

    Parameters
    ----------
    datapoint : np.array, shape=(X.shape[1])
        Datapoint = one row of the dataset `X`
    som_array : np.array
        Weight vectors of the SOM,
        shape = (self.n_rows, self.n_columns, X.shape[1])

    Returns
    -------
    distmat : np.array of float
        Distance between datapoint and each SOM node

    """
    # algorithms on the full matrix
    if self.distance_metric == "euclidean":
        return np.linalg.norm(som_array - datapoint, axis=2)

    # node-by-node algorithms
    distmat = np.zeros((self.n_rows, self.n_columns))
    if self.distance_metric == "manhattan":
        for node in self.node_list_:
            distmat[node] = dist.cityblock(
                som_array[node[0], node[1]], datapoint)

    elif self.distance_metric == "mahalanobis":
        for node in self.node_list_:
            som_node = som_array[node[0], node[1]]
            cov = np.cov(np.stack((datapoint, som_node), axis=0),
                         rowvar=False)
            cov_pinv = np.linalg.pinv(cov)   # pseudo-inverse
            distmat[node] = dist.mahalanobis(
                datapoint, som_node, cov_pinv)

    elif self.distance_metric == "tanimoto":
        # Note that this is a binary distance measure.
        # Therefore, the vectors have to be converted.
        # Source: Melssen 2006, Supervised Kohonen networks for
        # classification problems
        # VERY SLOW ALGORITHM!!!
        threshold = 0.5
        for node in self.node_list_:
            som_node = som_array[node[0], node[1]]
            distmat[node] = dist.rogerstanimoto(
                binarize(datapoint.reshape(1, -1), threshold=threshold,
                         copy=True),
                binarize(som_node.reshape(1, -1), threshold=threshold,
                         copy=True))

    elif self.distance_metric == "spectralangle":
        for node in self.node_list_:
            som_node = som_array[node[0], node[1]]
            # normalize by the norm of the node's weight vector and the
            # datapoint, not by the norm of the full weight tensor
            distmat[node] = np.arccos(np.divide(
                np.dot(som_node, datapoint),
                np.multiply(np.linalg.norm(som_node),
                            np.linalg.norm(datapoint))))

    return distmat
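# Standalone illustration (not part of the class above): the spectral-angle distance
# between one datapoint and every node of a toy SOM, written in vectorized NumPy.
# `som_array` and `datapoint` are made-up toy inputs; the point is that each node is
# normalized by its own weight-vector norm rather than by the full weight tensor.
import numpy as np

rng = np.random.default_rng(0)
som_array = rng.random((2, 3, 4))      # (n_rows, n_columns, n_features)
datapoint = rng.random(4)              # one row of X

dots = np.einsum("rcf,f->rc", som_array, datapoint)
node_norms = np.linalg.norm(som_array, axis=2)
cosines = dots / (node_norms * np.linalg.norm(datapoint))
spectral_angles = np.arccos(np.clip(cosines, -1.0, 1.0))
print(spectral_angles.shape)           # (2, 3): one distance per node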
def binarize_image(image, method='li', **kwargs): """Binarize image using one of the available methods: 'isodata', 'li', 'otsu', 'sauvola', and 'boolean'. Defaults to 'li'. Extra keyword arguments are passed in as is to the corresponding scikit-image thresholding function. The 'boolean' method refers to simple thresholding from a grey-scale image. If a 'threshold' kwarg is not passed to the 'boolean' method, 'li' thresholding is performed. For reference Sezgin M. and Sankur B. (2004) "Survey over Image Thresholding Techniques and Quantitative Performance Evaluation" Journal of Electronic Imaging, 13(1): 146-165 DOI:10.1117/1.1631315 """ if image.ndim != 2: # image is not gray-scale image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY) if np.unique(image).size == 2: # image is already binary return image boolean_threshold = kwargs.get('threshold', None) if method == 'boolean' and boolean_threshold: preprocessing.binarize(image, threshold=boolean_threshold, copy=False) return convert(image) if method not in ('sauvola', 'isodata', 'otsu', 'li'): method = 'li' thresh_func = getattr(filters.thresholding, "threshold_{}".format(method)) threshold = thresh_func(image, **kwargs) # OpenCV can't write black and white images using boolean values, it needs # at least a 8bits 1-channel image ranged from 0 (black) to 255 (white) return convert(image <= threshold)
def binarize_encode_target_columns(apache_df_list):
    """
    Binarize the predictions and encode the actual target columns for the
    apache prediction tables.

    NOTE - all column names are the same names as when they were queried from
    the database. This will not work if you have renamed predictedicumortality,
    predictedhospitalmortality, actualicumortality or actualhospitalmortality.
    If you have, please rename them back or change them directly in this function.

    Parameters
    ------------
    apache_df_list: list of dataframe objects
        The dataframes on which the operations will be performed.

    Returns
    ------------
    None, directly makes changes to the dataframes listed in apache_df_list.
    Four new columns will be added:
        icu_death_prediction_label : class labels from the predictedicumortality column
        hosp_death_prediction_label : class labels from the predictedhospitalmortality column
        icu_deaths : class labels for the actualicumortality column
        hosp_deaths : class labels for the actualhospitalmortality column
    """
    # set the threshold
    threshold = 0.5

    # loop through the dataframes, binarize predictions and encode labels for the established truth
    for df in apache_df_list:
        # binarize predictions
        icu_death_predictions = binarize(
            df['predictedicumortality'].values.reshape(-1, 1), threshold=threshold)
        hosp_death_predictions = binarize(
            df['predictedhospitalmortality'].values.reshape(-1, 1), threshold=threshold)
        df['icu_death_prediction_label'] = icu_death_predictions
        df['hosp_death_prediction_label'] = hosp_death_predictions

        # encode labels for the actual outcomes
        df['icu_deaths'] = df['actualicumortality'].map(
            lambda status: 0 if status == 'ALIVE' else 1)
        df['hosp_deaths'] = df['actualhospitalmortality'].map(
            lambda status: 0 if status == 'ALIVE' else 1)
def train(self, X_train, y_train, silent = False): '''train the model, X_train contains the tweet in each row''' if self.useTfIdf: self.vectorizer = TfidfVectorizer(ngram_range=(self.ngram_s, self.ngram_e), tokenizer=lambda x: x.split(), lowercase=False, preprocessor=lambda x: x) else: self.vectorizer = CountVectorizer(ngram_range=(self.ngram_s, self.ngram_e), tokenizer=lambda x: x.split(), lowercase=False, preprocessor=lambda x: x) if self.multinomial: self.model = MultinomialNaiveBayes() else: self.model = BernoulliNaiveBayes() self.vectorizer.fit(X_train.astype('str')) #assert len(self.vectorizer.stop_words_) == 0 #we don't want preprocess by scikit learn, we already performed it #print(self.vectorizer.get_feature_names()) if not silent: print('vectorizer trained') X_train_bow = self.vectorizer.transform(X_train.astype('str')) if not self.multinomial: binarize(X_train_bow, copy=False) if not silent: print('train data vectorized') self.model.train(X_train_bow, y_train) if not silent: print('model trained')
def getSrlRepresentation(cas, intensity=False, log=False, bnrz=False, representationSize=200): from sklearn.preprocessing import binarize model = models.Word2Vec.load('models/word2vec/srlModel') ret = [None] * len(cas.sentences) for i, sentence in enumerate(cas.srlSentences): numRows = sum([len(clause) for clause in sentence]) altSentence = np.zeros((numRows, representationSize)) currentRow = 0 for clause in sentence: for j, (role, text) in enumerate(clause.iteritems()): word = str((role, text)) try: altWord = np.multiply( np.add(np.divide(model[word], 2.0), 0.5), 255) if intensity else model[word] altWord = np.multiply( binarize(altWord, threshold=255.0 / 2.0), 255) if bnrz and intensity else altWord altWord = binarize( altWord) if bnrz and not intensity else altWord altSentence[currentRow, :] = altWord except: altSentence[currentRow, :] = altSentence[ j - 1, :] if j != 0 else np.zeros(representationSize) currentRow += 1 ret[i] = altSentence return ret
def main(): logging.info(u"Getting clusters data") uid_to_ug = get_ug_data(args.user_cluster) bid_to_bg, bg_iids = get_bg_data(args.booking_cluster) logging.info("Reading training data") training_df = pd.read_csv(args.training_csv) tr_m = get_matrix(training_df, uid_to_ug, bid_to_bg) logging.info(u"Training matrix: %s", get_sparse_matrix_info(tr_m)) logging.info("Reading testing data") # we don't care about repetitive actions in the testing testing_df = pd.read_csv(args.testing_csv)[["code", "propcode"]].drop_duplicates() logging.info("Preparing similarity matrix") sim_m = get_similarity_matrix(tr_m) logging.info("Testing hit ratio at top-%s", args.top_k) recs_m = get_topk_recs(tr_m, sim_m, binarize(tr_m), args.top_k) logging.info(u"Hit ratio: %.3f", hit_ratio(recs_m, testing_df, uid_to_ug, bg_iids)) if args.top_k_iid_per_uid: recs_m = get_topk_recs(tr_m, sim_m, binarize(tr_m)) store_data_for_eval(recs_m, testing_df, uid_to_ug, bg_iids)
def test_preprocessing_assignment(self): iris = datasets.load_iris() df = pdml.ModelFrame(iris) original_columns = df.data.columns df['sepal length (cm)'] = df[ 'sepal length (cm)'].preprocessing.binarize(threshold=6) self.assertIsInstance(df, pdml.ModelFrame) binarized = pp.binarize(np.atleast_2d(iris.data[:, 0]), threshold=6) expected = np.hstack([binarized.T, iris.data[:, 1:]]) self.assert_numpy_array_almost_equal(df.data.values, expected) tm.assert_index_equal(df.data.columns, original_columns) # recreate data iris = datasets.load_iris() df = pdml.ModelFrame(iris) target_columns = ['sepal length (cm)', 'sepal width (cm)'] df[target_columns] = df[target_columns].preprocessing.binarize( threshold=6) self.assertIsInstance(df, pdml.ModelFrame) binarized = pp.binarize(iris.data[:, 0:2], threshold=6) expected = np.hstack([binarized, iris.data[:, 2:]]) self.assert_numpy_array_almost_equal(df.data.values, expected) tm.assert_index_equal(df.data.columns, original_columns)
def superimpose_two_masks(mask_fn1, mask_fn2): img_in = cv2.imread(mask_fn1, cv2.IMREAD_GRAYSCALE) img_in = binarize(img_in, threshold=50, copy=True) img_side = cv2.imread(mask_fn2, cv2.IMREAD_GRAYSCALE) img_side = binarize(img_side, threshold=50, copy=True) composite = cv2.bitwise_or(img_in,img_side) return composite
def binarize(pred, threshold=0.5): # Batch_wise if pred.ndim == 3: return np.array( [pre.binarize(sub, threshold=threshold) for sub in pred]) else: return pre.binarize(pred, threshold=threshold)
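# Why the wrapper above iterates over the batch axis: scikit-learn's binarize expects
# 2-D input, so a 3-D prediction tensor is handled slice by slice. Toy values only.
import numpy as np
from sklearn import preprocessing as pre

pred = np.array([[[0.2, 0.7], [0.9, 0.4]],
                 [[0.6, 0.1], [0.3, 0.8]]])            # (batch, rows, cols)
out = np.array([pre.binarize(p, threshold=0.5) for p in pred])
print(out.shape)                                       # (2, 2, 2)
print(out[0])                                          # [[0. 1.] [1. 0.]]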
def bns(X, y):
    """
    Implements the bi-normal separation scoring.
    """
    # binarization: from counts to presence/absence
    binarize(X, threshold=0.0, copy=False)

    # one column per class
    Y = LabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:  # binary problem case
        Y = np.append(1 - Y, Y, axis=1)

    pos = np.sum(Y, axis=0)
    neg = Y.shape[0] - pos

    tp = safe_sparse_dot(X.T, Y)
    fp = np.sum(tp, axis=1).reshape(-1, 1) - tp

    tpr = bounded(tp / pos.astype(float))
    fpr = bounded(fp / neg.astype(float))

    bns = np.abs(_z_score(tpr) - _z_score(fpr))

    return bns[:, 1], None
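# Self-contained sketch of the bi-normal separation idea used above (the `bounded` and
# `_z_score` helpers are not shown in this snippet): BNS(f) = |F^-1(tpr) - F^-1(fpr)|,
# with F^-1 the standard-normal inverse CDF and both rates clipped away from 0 and 1.
# The counts below are made up for illustration.
import numpy as np
from scipy.stats import norm

tp, pos = 40, 50        # positive-class documents containing the feature / total positives
fp, neg = 5, 100        # negative-class documents containing the feature / total negatives

eps = 0.0005
tpr = np.clip(tp / pos, eps, 1 - eps)
fpr = np.clip(fp / neg, eps, 1 - eps)
print(abs(norm.ppf(tpr) - norm.ppf(fpr)))   # the BNS score for this single feature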
def jaccard_sim(self): '''given a sparse matrix, calculate jaccard sim ** ref : http://na-o-ys.github.io/others/2015-11-07-sparse-vector-similarities.html ''' if self.kind == 'ubcf': # assure binarize sp matrix and astype int16 mat = binarize(self.inter).astype('int16') elif self.kind == 'ibcf': # assure binarize sp matrix and astype int16 mat = binarize(self.inter.T).astype('int16') rows_sum = mat.getnnz(axis=1).astype('int16') # ab = mat.dot(mat.T).astype('float16') # mat x t(mat) # for rows aa = np.repeat(rows_sum, ab.getnnz(axis=1)) # for columns bb = rows_sum[ab.indices] similarities = ab.tocoo(copy=True) similarities.data /= (aa + bb - ab.data) del aa, bb, ab # large memory cost similarities = similarities.astype('float32') # similarities.setdiag(0) ## similarities = similarities.tocsr() similarities.eliminate_zeros() sparsity = float(similarities.nnz / mat.shape[0]**2) * 100 print( 'similarity (jaccard) matrix built ({}), \nsparsity of similarity: {:.2f} %' .format(self.kind, sparsity)) self.sim = similarities
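# Small self-contained check of the sparse Jaccard trick used above: with binary
# interactions, mat.dot(mat.T) gives the intersection sizes and the row nnz counts
# give the set sizes, so jaccard = ab / (|A| + |B| - ab). Toy matrix only.
import numpy as np
from scipy.sparse import csr_matrix

mat = csr_matrix(np.array([[1, 1, 0, 1],
                           [1, 0, 0, 1],
                           [0, 1, 1, 0]]))
rows_sum = mat.getnnz(axis=1)
ab = mat.dot(mat.T).toarray()
jaccard = ab / (rows_sum[:, None] + rows_sum[None, :] - ab)
print(np.round(jaccard, 2))    # diagonal 1.0; rows 0 and 1 share 2 of 3 items -> 0.67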
def test_binarize(self): iris = datasets.load_iris() df = pdml.ModelFrame(iris) result = df.preprocessing.binarize() expected = pp.binarize(iris.data) self.assertTrue(isinstance(result, pdml.ModelFrame)) self.assert_numpy_array_almost_equal(result.data.values, expected) self.assert_index_equal(result.columns, df.data.columns) result = df.preprocessing.binarize(threshold=5) expected = pp.binarize(iris.data, threshold=5) self.assertTrue(isinstance(result, pdml.ModelFrame)) self.assert_numpy_array_almost_equal(result.data.values, expected) self.assert_index_equal(result.columns, df.data.columns) s = df['sepal length (cm)'] self.assertTrue(isinstance(s, pdml.ModelSeries)) result = s.preprocessing.binarize() expected = pp.binarize(iris.data[:, 0])[0] self.assertTrue(isinstance(result, pdml.ModelSeries)) self.assert_numpy_array_almost_equal(result.values, expected) self.assertEqual(result.name, 'sepal length (cm)') result = s.preprocessing.binarize(threshold=6) expected = pp.binarize(iris.data[:, 0], threshold=6)[0] self.assertTrue(isinstance(result, pdml.ModelSeries)) self.assert_numpy_array_almost_equal(result.values, expected) self.assertEqual(result.name, 'sepal length (cm)')
def ig(X, y):
    """
    This method calculates the information gain for two random variables I(X, Y).
    """
    # binarization: from counts to presence/absence
    binarize(X, threshold=0.0, copy=False)

    # one column per class
    Y = LabelBinarizer().fit_transform(y)
    if Y.shape[1] == 1:  # binary problem case
        Y = np.append(1 - Y, Y, axis=1)

    Y_prob = (np.sum(Y, axis=0, dtype=np.float64) / len(Y)).reshape(-1, 1)

    # calculate the class entropy H(Y)
    class_entropy = _entropy(Y_prob)

    X_y_count = safe_sparse_dot(Y.T, X)
    # TODO XXX FIXME check whether this probability is computed correctly
    X_y_prob = \
        X_y_count / np.sum(X_y_count, axis=0, dtype=np.float64)

    # calculate the conditional entropy of the class given the feature H(y|f_i)
    cond_entropy = _entropy(X_y_prob)
    # TODO XXX FIXME check whether the conditional entropy is computed correctly

    print("class:", class_entropy)
    print("cond_entropy:", cond_entropy)

    infogain = class_entropy - cond_entropy

    return infogain, None
def greedy_cailp(positive_coverage, negative_coverage, k=20): num_features = positive_coverage.shape[1] num_positive_tweets = positive_coverage.shape[0] num_negative_tweets = negative_coverage.shape[0] positive_bin = binarize(positive_coverage) negative_bin = binarize(negative_coverage) positive_lil = positive_bin.tolil() negative_lil = negative_bin.tolil() selected_features = [] for i in range(k): print(i) scores = (positive_lil.sum(axis=0) / float(num_positive_tweets)) \ - (negative_lil.sum(axis=0) / float(num_negative_tweets)) selected_feature = scores.argmax() if selected_feature not in selected_features: covered_pos_tweets = list( positive_lil[:, selected_feature].nonzero()[0]) utils.delete_row_lil(positive_lil, covered_pos_tweets) covered_neg_tweets = list( negative_lil[:, selected_feature].nonzero()[0]) utils.delete_row_lil(negative_lil, covered_neg_tweets) selected_features.append(selected_feature) else: break return selected_features
def main(): logging.info("Reading training data") training_df = pd.read_csv(args.training_csv) tr_m, uid_to_row, iid_to_col = get_training_matrix_and_indices(training_df) logging.info("Training matrix: %s", get_sparse_matrix_info(tr_m)) logging.info("Reading testing data") testing_df = pd.read_csv(args.testing_csv)[["code", "propcode"]].drop_duplicates() logging.info("Preparing similarity matrix") sim_m = get_similarity_matrix(tr_m) logging.info("Testing hit ratio at top-%s", args.top_k) recs_m = get_topk_recs( normalize(tr_m), sim_m, binarize(tr_m), args.top_k, ) logging.info("Hit ratio: %.3f", hit_ratio(recs_m, testing_df, uid_to_row, iid_to_col)) if args.top_k_iid_per_uid: recs_m = get_topk_recs( tr_m, sim_m, binarize(tr_m) ) store_data_for_eval(recs_m, testing_df, uid_to_row, iid_to_col)
def meanThreshold(adata, groupby, threshold, return_df=False, layer=None, use_raw=False, transformation="log1p"): """Binarize gene expression for groups aggregated by mean. Returns: adata object with updated uns.gene_call """ from sklearn.preprocessing import binarize import pandas as pd df = get_adata_df(adata, layer=layer, use_raw=use_raw, transformation=transformation) result = df.groupby(by=adata.obs[groupby], axis=1).mean() binarize(result, threshold=threshold, copy=False) if return_df is True: return result else: adata.uns.update({"gene_call": result}) return adata
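# Self-contained sketch of the aggregation step above, with a made-up genes x cells
# frame: average the expression per group of columns, then binarize at the threshold.
# (Transposing for the groupby is an equivalent formulation of the column-wise mean.)
import pandas as pd
from sklearn.preprocessing import binarize

df = pd.DataFrame([[0.0, 2.0, 4.0], [1.0, 1.0, 0.0]],
                  index=["gene1", "gene2"], columns=["c1", "c2", "c3"])
groups = pd.Series(["A", "A", "B"], index=df.columns)

result = df.T.groupby(groups).mean().T      # mean expression per gene and group
gene_call = pd.DataFrame(binarize(result, threshold=0.5),
                         index=result.index, columns=result.columns)
print(gene_call)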
def get_prediction_metrics(self): print("Getting prediction metrics") df = self.get_predictions_as_df(self.predictions) metrics = {} prediction_metrics = {} annotation_metrics = {} prediction_metrics["event_count"] = len(df["start"]) prediction_metrics["mean_duration"] = df["duration"].mean() if len( df["start"]) > 0 else 0 # Hour * hz prediction_metrics[ "recording_length_minutes"] = self.last_predicted_index / (60 * 10) if prediction_metrics["recording_length_minutes"] > 0: prediction_metrics["calculated_ahi"] = ( prediction_metrics["event_count"] / prediction_metrics["recording_length_minutes"]) * 60 metrics["prediction"] = prediction_metrics if self.ground_truth is not None: df = self.get_predictions_as_df(self.ground_truth) annotation_metrics["event_count"] = len(df["start"]) annotation_metrics["mean_duration"] = df["duration"].mean() if len( df["start"]) > 0 else 0 annotation_metrics[ "annotation_length_minutes"] = self.ground_truth_length / (60 * 10) metric_end = int( float(max(self.ground_truth_length, self.last_predicted_index))) if annotation_metrics["annotation_length_minutes"] > 0: annotation_metrics["calculated_ahi"] = ( annotation_metrics["event_count"] / annotation_metrics["annotation_length_minutes"]) * 60 predictions = self.predictions[:metric_end] ground_truth = self.ground_truth[:metric_end] ground_truth_binary = np.ravel( binarize(ground_truth.reshape(1, -1), 0)) predictions_binary = np.ravel( binarize(predictions.reshape(1, -1), 0)) annotation_metrics["accuracy_score"] = accuracy_score( ground_truth_binary, predictions_binary) annotation_metrics["f1_score"] = f1_score(ground_truth_binary, predictions_binary) annotation_metrics["precision_score"] = precision_score( ground_truth_binary, predictions_binary) annotation_metrics["recall_score"] = recall_score( ground_truth_binary, predictions_binary) metrics["annotation"] = annotation_metrics return metrics
def partition(features, a, probs, w): ig = 0 ap = {} # multiply a by row and re-sparsify x = features.index(w) ap['yes'] = a.multiply(binarize(a[x])).tocsr() # a_no is whatever's left of 'a' after removing a_yes ap['no'] = a - ap['yes'] # sum a's columns and binarize qk = binarize(a.sum(axis=0))[0] pk = {} pk['no'] = binarize(ap['no'].sum(axis=0))[0] # pk['yes'] is whatever's left of qk after removing pk['no'] pk['yes'] = qk - pk['no'] ap['yes'] = ap['yes'].multiply(pk['yes'].reshape(-1, 1).T).tocsr() # for qk and both pk's, multiply by static probs vector, then normalize qk = qk * probs ig_c = {} ig_uc = {} ig_c['yes'] = 0.0 ig_uc['yes'] = 0.0 ig_c['no'] = 0.0 ig_uc['no'] = 0.0 if np.sum(qk) > 0: qk_num = len(np.where(qk != 0)[0]) qk = normalize(qk) if VERBOSE: print(CRED + 'qk ' + str(qk_num) + CEND, '\n', a.A, '\n', qk) for d in ['yes', 'no']: pk[d] = pk[d] * probs if np.sum(pk[d]) > 0: pkd_nz = np.where(pk[d] != 0)[0] pk_num = len(pkd_nz) pk[d] = normalize(pk[d]) qk_nz = np.where(qk != 0)[0] ig_uc[d] = entropy(pk=pk[d][qk_nz], qk=qk[qk_nz], base=2) ig_c[d] = (pk_num / qk_num) * ig_uc[d] ig += ig_c[d] if VERBOSE: print(CRED + 'pk[' + d + '] ' + str(pk_num) + CEND, '\n', ap[d].A, '\n', pk[d]) if VERBOSE: print(ig) return ig, ap['yes'], ig_uc, ig_c
def binarizer():
    a = [[-1, 3, -2], [5, -7, -4]]
    b = preprocessing.binarize(a)                   # default threshold=0.0 -> [[0, 1, 0], [1, 0, 0]]
    print(b)
    print(preprocessing.binarize(a, threshold=-2))  # values > -2 become 1 -> [[1, 1, 0], [1, 0, 0]]
    estimator = preprocessing.Binarizer()           # estimator form of the same transformation
    print(estimator.transform(a))
def perform_test(self, X_test, silent=False): X_test_bow = self.vectorizer.transform(X_test.astype('str')) if not self.multinomial: binarize(X_test_bow, copy=False) if not silent: print('test data vectorized') y_score = self.model.multi_prediction_score(X_test_bow) y_pred = self.model.multi_predict_class_from_score(y_score, threshold=self.threshold) return y_score, y_pred
def predict(self, X): ''' Predict class labels. ''' if self.mode == 'average': return binarize(self.predict_proba(X)[:, [1]], 0.5) else: res = binarize(X, 0.5) return np.apply_along_axis( lambda x: np.bincount(x.astype(int), self.weights).argmax(), axis=1, arr=res)
def Binarize(self, column=None):
    """ Feature Binarization: thresholding numerical features to get boolean values """
    try:
        # the dataset is assumed to be stored on the instance as `self._dataset`
        if column is None:
            self._dataset = preprocessing.binarize(self._dataset)
        else:
            self._dataset[column] = preprocessing.binarize(self._dataset[column])
    except Exception as e:
        print("Binarize failed!")
        print(e)
def multibinarize(x, thresholds): if hasattr(x, "fillna"): x = x.fillna(0).values.reshape(-1, 1) else: x = x.reshape(-1, 1) res = None for threshold in thresholds: if res is None: res = binarize(x, threshold) else: res += binarize(x, threshold) return res[:, 0]
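# Standalone sketch of the idea behind `multibinarize`: one indicator per threshold,
# summed into an ordinal bucket index. Uses the keyword `threshold=` form of
# scikit-learn's binarize; values and thresholds below are illustrative only.
import numpy as np
from sklearn.preprocessing import binarize

x = np.array([0.1, 1.5, 2.5, 3.5]).reshape(-1, 1)
buckets = sum(binarize(x, threshold=t) for t in (1, 2, 3))
print(buckets[:, 0])    # [0. 1. 2. 3.]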
def process_nps(nps): print("processing meanings ...") probs = [] pairs = [] features = [] chunks = [] vectors = [] chunk_size = 1000 # add adjective features for alist in nps.adjs.unique(): for w in alist.split(','): if w not in features: features.append(w) # add noun features for w in nps.noun.unique(): if w not in features: features.append(w) # create vector for each NP total = len(nps) for i, row in nps.iterrows(): print_progress(i + 1, total) vector = [0] * len(features) vector[features.index(row['noun'])] = 1 for adj in row['adjs'].split(','): vector[features.index(adj)] = 1 if WEIGHTED_PROBS: probs.append(np.clip(row['count'], 0, 100)) else: probs.append(1) vectors.append(vector) if len(vectors) > chunk_size: chunks.append(csr_matrix(binarize(np.array(vectors).T)).tocsr()) vectors = [] chunks.append(csr_matrix(binarize(np.array(vectors).T)).tocsr()) print("") print("combining vectors...") a_orig = hstack(chunks).tocsr() print("normalizing probabilities ...") probs = normalize(np.array(probs)) print('total feature vectors:', len(probs)) return features, probs, a_orig
def get_score(X, y, clf, scoring = 'accuracy'): from sklearn.preprocessing import binarize if scoring == 'accuracy': from sklearn.metrics import accuracy_score score = accuracy_score(y, binarize(clf.predict(X), 0.5)) elif scoring =='f1': from sklearn.metrics import f1_score score = f1_score(y, binarize(clf.predict(X), 0.5)) else: score = clf.score(X, y) return score
def binary_bow(self, n=None):
    data_test = self.data_test
    data_train = self.data_train
    if n:
        X_te = binarize(np.array(data_test[0][0:n].todense()))
        X_tr = binarize(np.array(data_train[0][0:n].todense()))
        small_test = X_te, data_test[1][0:n]
        small_train = X_tr, data_train[1][0:n]
        return small_train, small_test
    return data_train, data_test
def evaluate(self, pred_all, test, method='precision'):
    """
    params
    ======
    pred_all: (ndarray) predicted/recommended result for each user
    test: (csr_matrix) testing set (test.shape[0] should be the same as pred_all.shape[0])
    method: (str) evaluation method, 'precision' (default) or 'recall'

    attribute
    =========
    precision
    recall
    """
    assert type(test) == sp.csr_matrix
    assert test.shape[0] == pred_all.shape[0]
    if method == 'precision':
        test_lil = binarize(test).tolil()  # binarize and transform to lil
        prec_array = np.zeros(pred_all.shape[0])
        num_of_test_data = 0
        for user, items in enumerate(test_lil.rows):
            prec_array[user] = len(np.intersect1d(
                items, pred_all[user, ])) / len(pred_all[user, ])
            if items != []:
                num_of_test_data += 1
        # return np.sum(prec_array)/num_of_test_data
        self.precision = np.sum(prec_array) / num_of_test_data
        print("\n-------------")
        print("model: {},\ntopN: {}".format(self.kind, self.topN))
        print("precision: {:.2f} %".format(self.precision * 100))

    if method == 'recall':
        test_coo = binarize(test).tocoo()  # binarize and transform to coo
        score = 0
        nonzero_rowsets = set(test_coo.row)
        for row, col, v in zip(test_coo.row, test_coo.col, test_coo.data):
            if col in pred_all[row, ]:
                score += 1
        self.recall = score / len(nonzero_rowsets)
        print("\n-------------")
        print("model: {},\ntopN: {}".format(self.kind, self.topN))
        print("recall:{:.2f} %".format(score / len(test_coo.data) * 100))
def get_recs(self, ug_id, iid_recs, top_clusters=None, min_iid_per_bg=None): bg_recs_row = self.ug_bg_recs_m[ug_id] bg_mask = binarize( self.item_dp.get_iid_per_bg_row(binarize(iid_recs), min_iid_per_bg) ) bg_recs_row = bg_recs_row.multiply(bg_mask) if top_clusters is not None: arg_ids = np.argsort(bg_recs_row.data)[-top_clusters:] rows, cols = bg_recs_row.nonzero() bg_recs_row = csr_matrix( (bg_recs_row.data[arg_ids], (rows[arg_ids], cols[arg_ids])), shape=bg_recs_row.shape ) return bg_recs_row
def getRepresentation(cas, intensity=False, log=False, bnrz=False, representationSize=200): from sklearn.preprocessing import binarize ret = [None]*len(cas.tokens) for i, sentence in enumerate(cas.tokens): altSentence = np.zeros((len(sentence.split()), representationSize)) for j, word in enumerate(sentence.split()): try: altWord = np.multiply(np.add(np.divide(model[word], 2.0), 0.5), 255) if intensity else model[word] altWord = np.multiply(binarize(altWord, threshold=255.0/2.0), 255) if bnrz and intensity else altWord altWord = binarize(altWord) if bnrz and not intensity else altWord altSentence[j,:] = altWord except: altSentence[j,:] = altSentence[j-1,:] if j != 0 else np.zeros(representationSize) ret[i] = altSentence return ret
def PreprocessingData(processType): if processType=="Normalization": AlgorithmOperation.train_X = preprocessing.normalize(AlgorithmOperation.train_X, norm='l2') AlgorithmOperation.test_X = preprocessing.normalize(AlgorithmOperation.test_X, norm='l2') elif processType=="Scale": AlgorithmOperation.train_X =preprocessing.scale(AlgorithmOperation.train_X) AlgorithmOperation.test_X =preprocessing.scale(AlgorithmOperation.test_X) elif processType=="Binarization": AlgorithmOperation.train_X =preprocessing.binarize(AlgorithmOperation.train_X) AlgorithmOperation.test_X =preprocessing.binarize(AlgorithmOperation.test_X) elif processType=="Polynomial Feature": poly=preprocessing.PolynomialFeatures(2) AlgorithmOperation.train_X = poly.fit_transform(AlgorithmOperation.train_X) AlgorithmOperation.test_X = poly.fit_transform(AlgorithmOperation.test_X)
def example2():
    """Method 2 [recommended]
    """
    X = np.array([[1, -1, 2],   ## the "f" dtype is essential: for the preprocessing, the matrix elements must be floating point
                  [2, 0, 0],
                  [0, 1, -1]], dtype="f")

    print("binarized X = \n%s\n" % preprocessing.binarize(X, threshold=1.1))
def train_step(self, x, y): # perform following steps: # -reset the gradients # -propagate through the network # -calculate the loss # -compute gradient by backward propagation # -update weights # -return the loss # TODO if self._cuda: x = x.clone().detach().cuda() y = y.clone().detach().cuda().squeeze() #x = t.tensor(x, dtype=t.float).cuda() #y = t.tensor(y, dtype=t.float).cuda().squeeze() self._optim.zero_grad() y_pred = self._model(x) y_predTmp = y_pred.clone() y_predTmp = binarize(y_predTmp.cpu().detach().numpy(), threshold=0.5) # numpy array w/o grad y_pred.data = t.tensor(y_predTmp, dtype=t.float).cuda() loss = self._crit(y_pred, y.float()) loss.backward() self._optim.step() return loss
def test_model(LRM, data, TBI=False): global GAMMA y = list(data.TBResult.values) test_recs = list(data.StudyNum.values) X = data.drop(['StudyNum','TBResult'],axis=1) probs = LRM.predict_proba(X)[:,1] """ Calculate AUC acc using ROC analysis """ # Get FPR and TPR for the test set fpr, tpr, thresh = roc_curve(y,probs) # Calc AUC acc auc_acc = auc(fpr,tpr) pred = map(int,binarize(np.array(probs).reshape(1,-1),threshold = GAMMA)[0]) if not TBI: ACC,SENS,SPEC = eval_model(pred, y) return [ACC,SENS,SPEC,auc_acc] else: ACC,SENS,SPEC = eval_model(pred, y, probs=probs, test_px=test_recs, TBI=True) return [ACC,SENS,SPEC]
def run_test(seqs, label_seqs, sess, preds_T, input_PHs, label_PHs, mask_PHs, seq_length_PH, loss_T, options): all_losses = [] all_preds = [] all_labels = [] batch_size = options['batch_size'] for idx in xrange(len(label_seqs) / batch_size): batch_x = seqs[idx * batch_size:(idx + 1) * batch_size] batch_y = label_seqs[idx * batch_size:(idx + 1) * batch_size] inputs, _, masks, seq_length = mime_util.st_preprocess_hf_aux( batch_x, options) preds, loss = sess.run( [preds_T, loss_T], feed_dict={ input_PHs[0]: inputs[0], input_PHs[1]: inputs[1], input_PHs[2]: inputs[2], mask_PHs[0]: masks[0], mask_PHs[1]: masks[1], mask_PHs[2]: masks[2], label_PHs[-1]: batch_y, seq_length_PH: seq_length, }) all_losses.append(loss) all_preds.extend(list(preds)) all_labels.extend(batch_y) auc = roc_auc_score(all_labels, all_preds) aucpr = average_precision_score(all_labels, all_preds) accuracy = (np.array(all_labels) == np.squeeze( binarize(np.array(all_preds).reshape(-1, 1), threshold=.5))).mean() return np.mean(all_losses), auc, aucpr
def resc(patch):
    """
    :param patch: [image, mask]
    :return: random rescaling of the pair [image, mask]

    --- Rescaling reinforces axon size diversity ---
    """
    # pick a single random scale factor
    scale = random.choice([0.5, 0.75, 1.0, 1.5, 2.0])

    image_rescale = rescale(patch[0], scale)
    mask_rescale = rescale(patch[1], scale)
    s_r = mask_rescale.shape[0]
    q_h, r_h = divmod(256 - s_r, 2)

    if q_h > 0:
        # the rescaled patch is smaller than 256: pad it back to 256x256
        image_rescale = np.pad(image_rescale, (q_h, q_h + r_h), mode="reflect")
        mask_rescale = np.pad(mask_rescale, (q_h, q_h + r_h), mode="reflect")
    else:
        # the rescaled patch is larger than 256: crop a random 256x256 patch
        patches = extract_patch(image_rescale, mask_rescale, 256)
        i = np.random.randint(len(patches))
        image_rescale, mask_rescale = patches[i]

    mask_rescale = preprocessing.binarize(np.array(mask_rescale), threshold=0.001)
    data_rescale = [image_rescale, mask_rescale]

    return data_rescale
def do_transformations(self):
    # binarize counts
    if self.transform == 'binarize':
        print("Binarizing")
        self.feature_counts = binarize(self.feature_counts, copy=False)
        #self.feature_counts = sparse.csr_matrix(self.feature_counts > 0, dtype=int)
    elif self.transform == 'tfidf':
        print("Doing tf-idf transform")
        #doc_sums = self.feature_counts.sum(axis=1)
        #if np.min(doc_sums) == 0:
        #    doc_sums[doc_sums == 0] = 1.0
        #tf = sparse.csr_matrix(self.feature_counts.multiply(1.0/doc_sums))
        n_items, n_features = self.feature_counts.shape
        tf = normalize(self.feature_counts, norm='l1', axis=1, copy=False)
        doc_counts = self.vocab.get_all_doc_counts()
        n_docs = doc_counts.max()
        # add one to avoid zeros which might screw up the matrix size
        idf = sparse.csr_matrix(np.log(float(n_docs + 1) / doc_counts), dtype=float)
        print(tf.shape, idf.shape)
        self.feature_counts = tf.multiply(idf)
        assert self.feature_counts.shape == (n_items, n_features)
    elif self.transform == 'normalizel1' or self.transform == 'normalize':
        print("Normalizing rows")
        self.feature_counts = normalize(self.feature_counts, norm='l1', axis=1, copy=False)
    elif self.transform == 'normalizel2':
        print("Normalizing rows")
        self.feature_counts = normalize(self.feature_counts, norm='l2', axis=1, copy=False)

    if self.scale_factor is not None:
        self.feature_counts = self.feature_counts * self.scale_factor
def elastic_transform(image, gt, alpha, sigma, random_state=None):
    """
    :param image: image
    :param gt: ground truth
    :param alpha: deformation coefficient (high alpha -> strong deformation)
    :param sigma: std of the gaussian filter (high sigma -> smooth deformation)
    :param random_state:
    :return: deformation of the pair [image, mask]
    """
    if random_state is None:
        random_state = np.random.RandomState(None)

    shape = image.shape

    d = 4
    sub_shape = (shape[0] // d, shape[0] // d)  # integer division so the shape stays integral

    deformations_x = random_state.rand(*sub_shape) * 2 - 1
    deformations_y = random_state.rand(*sub_shape) * 2 - 1

    deformations_x = np.repeat(np.repeat(deformations_x, d, axis=1), d, axis=0)
    deformations_y = np.repeat(np.repeat(deformations_y, d, axis=1), d, axis=0)

    dx = gaussian_filter(deformations_x, sigma, mode="constant", cval=0) * alpha
    dy = gaussian_filter(deformations_y, sigma, mode="constant", cval=0) * alpha

    x, y = np.meshgrid(np.arange(shape[0]), np.arange(shape[1]))
    indices = np.reshape(y + dy, (-1, 1)), np.reshape(x + dx, (-1, 1))

    elastic_image = map_coordinates(image, indices, order=1).reshape(shape)
    elastic_gt = map_coordinates(gt, indices, order=1).reshape(shape)
    elastic_gt = preprocessing.binarize(np.array(elastic_gt), threshold=0.5)

    return [elastic_image, elastic_gt]
def getPredictions(image_data, threshold, allShipping=False): """ This function returns np arrays of true labels, predicted labels, and predicted probabilities. image_data: generated from Keras image generator, in batch format threshold: the probability at which a classification should be considered shipping (1) allShipping: whether all image_data has a true shipping classification (eg. for the PHMSA data that is assumed to all have shipping activity) """ all_true = np.zeros(0) all_pred = np.zeros(0) pred_prob = np.zeros(0) for i in range(len(image_data)): image_batch, label_batch = image_data[i] if (not allShipping): all_true = np.append(all_true, get_true_labels(label_batch)) y_pred_prob = model.predict_proba(image_batch)[:, 1] y_pred_class = binarize([y_pred_prob], threshold)[0] all_pred = np.append(all_pred, y_pred_class) pred_prob = np.append(pred_prob, y_pred_prob) if (allShipping): all_true = np.repeat(1, len(all_pred)) return all_true, all_pred, pred_prob
def roc_auc(y_true, y_pred, jump=0.01): ''' Area under ROC (Receiver Operating Characteristics) curve Parameters ---------- y_true: numpy.ndarray Targets y_pred: numpy.ndarray Class probability References ---------- .. [1] https://developers.google.com/machine-learning/crash-course/classification/roc-and-auc Returns ------- roc_auc_score: float ROC AUC score ''' y_true, y_pred = y_true.reshape(-1, 1), y_pred.reshape(-1, 1) x = [] y = [] for thr in np.arange(0.01, 1 + jump, jump): y_pred_bin = binarize(y_pred, thr) tn, fp, fn, tp = confusion_binary(y_true, y_pred_bin) tpr = tp / (tp + fn) fpr = fp / (tn + fp) y.append(tpr) x.append(fpr) x = np.array(x) y = np.array(y) return np.abs(np.trapz(y, x)) # Why trapz gives negative value?
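# Self-contained cross-check of the threshold-sweep AUC above: scikit-learn's
# roc_curve plus the same trapezoidal rule should agree with roc_auc_score.
# Labels and scores below are toy values.
import numpy as np
from sklearn.metrics import roc_curve, roc_auc_score

y_true = np.array([0, 0, 1, 1, 1, 0])
y_score = np.array([0.1, 0.4, 0.35, 0.8, 0.7, 0.2])

fpr, tpr, _ = roc_curve(y_true, y_score)
print(abs(np.trapz(tpr, fpr)))          # trapezoidal area under the ROC points
print(roc_auc_score(y_true, y_score))   # the built-in scorer gives the same value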
def op_vs_ip(subid, image_types, imagepaths, op_direc, overlays): img_data_group=[] img_shape_group=[] ol_data_group=[] ol_shape_group=[] for i, path in enumerate(imagepaths): axial_slice, cor_slice, sag_slice, img_aspect_axial, img_aspect_cor, img_aspect_sag = pull_midslices(path) if os.path.isfile(overlays[i]): axial_slice_ol, cor_slice_ol, sag_slice_ol, img_aspect_axial_ol, img_aspect_cor_ol, img_aspect_sag_ol = pull_midslices(overlays[i]) ol_data_group.append([axial_slice_ol, cor_slice_ol, sag_slice_ol]) ol_shape_group.append([img_aspect_axial_ol, img_aspect_cor_ol, img_aspect_sag_ol]) else: ol_data_group.append(['null','null','null']) ol_shape_group.append(['null','null','null']) ## Append to Matrices img_data_group.append([axial_slice, cor_slice, sag_slice]) img_shape_group.append([img_aspect_axial,img_aspect_cor,img_aspect_sag]) my_cmap=plt.cm.gray fig, axarr = plt.subplots(ncols=np.shape(img_shape_group)[1], nrows=np.shape(img_shape_group)[0], figsize=(np.shape(img_shape_group)[0]*5,np.shape(img_shape_group)[1]*5)) plt.suptitle(subid+' File Comparison', fontsize=20) titlearray=['Axial', 'Coronal', 'Saggital'] for x in range(0,np.shape(img_shape_group)[0]): for y in range(0,np.shape(img_shape_group)[1]): im = axarr[x, y].imshow(img_data_group[x][y], cmap=my_cmap, aspect=img_shape_group[x][y]) axarr[x, y].set_xlabel('(Right) Radiological Convention (Left)', fontsize=10) axarr[x, y].set_title(image_types[x]+' '+titlearray[y]) #divider = make_axes_locatable(axarr[x, y]) #cax_ = divider.append_axes("right", size="5%", pad=0.05) #cbar = plt.colorbar(im, cax=cax_, ticks=MultipleLocator(round(np.max(img_data_group[x][y])/5, 1))) axarr[x, y].xaxis.set_visible(False) axarr[x, y].yaxis.set_visible(False) if os.path.isfile(overlays[x]): if x == 1: thresh=0.25 if x == 2: thresh=0.4 sl=np.array(ol_data_group[x][y]).astype(np.float64) sl=filters.sobel(sl) sl=preprocessing.binarize(sl, np.max(sl)*thresh) sl[sl < 1] = 'Nan' axarr[x, y].imshow(sl, cmap='autumn', aspect=ol_shape_group[x][y]) #plt.show() plt.tight_layout() plt.autoscale() plt.savefig(op_direc)
def load_data(self, features, X_threshold): """ Load data into c_data """ from neurosynth.analysis.reduce import average_within_regions # Load Masks by studies matrix # ADD FEATURE TO FILTER BY FEATURES masks_by_studies = average_within_regions(self.dataset, self.mask_img, threshold = self.thresh) study_ids = self.dataset.feature_table.data.index print "Loading data from neurosynth..." pb = tools.ProgressBar(len(list(masks_by_studies)), start=True) self.ids_by_masks = [] self.data_by_masks = [] for mask in masks_by_studies: m_ids = study_ids[np.where(mask == True)[0]] self.ids_by_masks.append(m_ids) self.data_by_masks.append(self.dataset.get_feature_data(ids=m_ids)) pb.next() self.mask_num = masks_by_studies.shape[0] self.mask_pairs = list(itertools.permutations(range(0, self.mask_num), 2)) filename = path.join(mkdtemp(), 'c_data.dat') self.c_data = np.memmap(filename, dtype='object', mode='w+', shape=(self.mask_num, self.mask_num)) # Load data for pair in self.mask_pairs: reg1_ids = self.ids_by_masks[pair[0]] reg2_ids = self.ids_by_masks[pair[1]] reg1_set = list(set(reg1_ids) - set(reg2_ids)) reg2_set = list(set(reg2_ids) - set(reg1_ids)) x1 = self.data_by_masks[pair[0]] x1 = np.array(x1)[np.where(np.in1d(reg1_ids, reg1_set))[0]] x2 = self.data_by_masks[pair[1]] x2 = np.array(x2)[np.where(np.in1d(reg2_ids, reg2_set))[0]] y = np.array([0]*len(reg1_set) + [1]*len(reg2_set)) X = np.vstack((x1, x2)) if X_threshold is not None: X = binarize(X, X_threshold) from neurosynth.analysis.classify import regularize X = regularize(X, method='scale') self.c_data[pair] = (X, y) if self.memsave: self.data_by_masks = [] self.ids_by_masks = []
def transform(self, X): """Compute the Jaccard similarity for all pairs of elements in ``X``. Rows i in ``X`` are assumed to represent pairs, where ``X[i, :n_features]`` and ``X[i, n_features:]`` correspond to their two individual elements, each representing a set. Calling ``transform`` computes the Jaccard similarity between these sets, i.e. such that ``Xt[i]`` is the Jaccard similarity of ``X[i, :n_features]`` and ``X[i, n_features:]``. Parameters ---------- :param X: array-like, shape (n_samples, n_features) Input data. Returns ------- :returns: Xt array-like, shape (n_samples, 1) The transformed data. """ n_samples, n_features_all = X.shape n_features = n_features_all // 2 X = binarize(X) X1 = X[:, :n_features] X2 = X[:, n_features:] sparse = sp.issparse(X) if sparse and not sp.isspmatrix_csr(X): X = X.tocsr() if sparse: if X.data.sum() == 0: return np.zeros((n_samples, 1)) numerator = np.asarray(X1.multiply(X2).sum(axis=1)).ravel() X_sum = X1 + X2 X_sum.data[X_sum.data != 0.] = 1 M = X_sum.sum(axis=1) A = M.getA() denominator = A.reshape(-1,) else: if len(X[X.nonzero()]) == 0.: return np.zeros((n_samples, 1)) numerator = (X1 * X2).sum(axis=1) X_sum = X1 + X2 X_sum[X_sum.nonzero()] = 1 denominator = X_sum.sum(axis=1) with np.errstate(divide="ignore", invalid="ignore"): Xt = numerator / denominator Xt[np.where(denominator == 0)[0]] = 0. return np.array(Xt).reshape(-1, 1)
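# Toy illustration of what the transformer above computes: each row of X holds two
# binary set indicators side by side, and the output is their Jaccard similarity.
import numpy as np

n_features = 3
X = np.array([[1, 1, 0, 1, 0, 0],     # {a, b} vs {a}    -> 1/2
              [1, 0, 1, 1, 0, 1]])    # {a, c} vs {a, c} -> 1
X1, X2 = X[:, :n_features], X[:, n_features:]
intersection = (X1 * X2).sum(axis=1)
union = ((X1 + X2) > 0).sum(axis=1)
print(intersection / union)           # [0.5 1. ]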
def eval_model(preds, y_ref, probs = [], test_px = [], TBI = False):
    global GAMMA

    if len(preds) != len(y_ref):
        print "Predicted labels and test labels don't have the same dimensions!"
        print "Predicted: ", len(preds), "; Tests: ", len(y_ref)
        exit()

    if not TBI:
        CM = confusion_matrix(y_ref, preds)
        TP = CM[1,1]
        TN = CM[0,0]
        FP = CM[0,1]
        FN = CM[1,0]

        ACC = (TP+TN)/float(TP+TN+FP+FN)
        SENS = TP/float(TP+FN)
        SPEC = TN/float(TN+FP)

        return ACC,SENS,SPEC

    else:
        i = np.arange(len(test_px))
        df = pd.DataFrame({"Recording": pd.Series(test_px, index = i),
                           "Prediction": pd.Series(preds, index = i),
                           "Reference": pd.Series(y_ref, index = i),
                           "Probabilities": pd.Series(probs, index = i)
                           }).sort_values(by="Recording")

        y_test_rec = []
        TBI_list = []
        for name, group in df.groupby("Recording"):
            l = group.Reference.iloc[0]
            y_test_rec.append(l)

            TB_prob = sum(group.Probabilities.values) / float(len(group.Probabilities))
            TBI_list.append(TB_prob)

        diagnosis_list = map(int, binarize(np.array(TBI_list).reshape(1,-1), threshold = GAMMA)[0])

        CM = confusion_matrix(y_test_rec, diagnosis_list)
        TP = CM[1,1]
        TN = CM[0,0]
        FP = CM[0,1]
        FN = CM[1,0]

        ACC = (TP+TN)/float(TP+TN+FP+FN)
        SENS = TP/float(TP+FN)
        SPEC = TN/float(TN+FP)

        return ACC, SENS, SPEC
def predictClass(self, threshold=0.5): # prediction # self.pred_y = self.model.predict(self.test_set_X) if self.is_keras: self.pred_y_prob = self.model.predict_proba(self.test_set_X)[:, 0] else: self.pred_y_prob = self.model.predict_proba(self.test_set_X)[:, 1] self.pred_y = binarize(self.pred_y_prob.reshape(1, -1), threshold)[0].astype(int)
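# Minimal end-to-end sketch of predicting classes at a custom probability threshold,
# mirroring `predictClass` above; the data and classifier are toy stand-ins.
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import binarize

X = np.array([[0.0], [0.5], [1.0], [1.5], [2.0], [2.5]])
y = np.array([0, 0, 0, 1, 1, 1])

clf = LogisticRegression().fit(X, y)
proba_pos = clf.predict_proba(X)[:, 1]
pred_strict = binarize(proba_pos.reshape(1, -1), threshold=0.8)[0].astype(int)
print(proba_pos.round(2))
print(pred_strict)    # only samples with P(y=1) > 0.8 are labelled positive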
def prob_maximum_low(self, x=None, y=None, show=True): x, y = self.proxy_xy(x, y) fiter = self.get_fiter() y_prob = MlFiterExcute.run_prob_cv_estimator(fiter, x, y, n_folds=10) l_pb = y_prob[y_prob < y_prob.mean()].mean() y_prob_l = binarize(y_prob.reshape(-1, 1), l_pb) if show: self.scores(y_prob_l, y) return l_pb
def binarize(df):
    """
    Binarize the data.

    :param df: input DataFrame
    :returns: the binarized data
    """
    if not isinstance(df, pd.DataFrame):
        raise Exception("df is not DataFrame!")
    return preprocessing.binarize(df)
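# Minimal usage sketch for the wrapper above: scikit-learn's binarize accepts a
# DataFrame but returns a plain ndarray of 0/1 values (default threshold is 0).
import pandas as pd
from sklearn import preprocessing

df = pd.DataFrame({"a": [-1.0, 0.0, 2.0], "b": [0.5, -0.5, 0.0]})
print(preprocessing.binarize(df))
# [[0. 1.]
#  [0. 0.]
#  [1. 0.]]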
def transform(self, X): #If the binarize option is set to true, we need now to recompute "f", our binarized word counter if(self.bina == True): f_hat = binarize(X, threshold = 0.0) else : f_hat = X f_tilde = f_hat.multiply(self.r) return f_tilde
def validation(data, px, y_px): global GAMMA # convert px and y_px for broadcasting px = np.array(px) y_px = np.array(y_px) # vanilla LogReg classifier LRM = LogisticRegression() skf = StratifiedKFold(y_px, n_folds = N_FOLDS, shuffle = True) print "Running",N_FOLDS,"Stratified Splits" probs = [] # Probabilities during validation preds = [] # Predictions made y_ref = [] # Labels as they were used in validation val_recs = [] # List of recordings as they were used in validation for train_idx, val_idx in skf: # Separate train and val sets using indexes X_train, y_train, X_val, y_val, val_px = leave_out_fold(data, px, train_idx, val_idx) # Train the LRM LRM.fit(X_train, y_train) # Save this LRM performance probs.extend(list(LRM.predict_proba(X_val)[:,1])) preds.extend(list(LRM.predict(X_val))) y_ref.extend(y_val) val_recs.extend(val_px) fpr, tpr, thresholds = roc_curve(y_ref, probs) """ Do ROC analysis and get optimal threshold for sens ~= spec """ i = np.arange(len(tpr)) roc = pd.DataFrame({'fpr' : pd.Series(fpr, index=i), 'tpr' : pd.Series(tpr, index = i), '1-fpr' : pd.Series(1-fpr, index = i), 'tf' : pd.Series(tpr - (1-fpr), index = i), 'thresholds' : pd.Series(thresholds, index = i) }) idx = (roc.tf).abs().argmin() thresh = roc.thresholds.iloc[idx] auc_acc = auc(fpr,tpr) # Perform classification with optimal threshold preds_opt = map(int, binarize(np.array(probs).reshape(1,-1), threshold=thresh)[0]) GAMMA = thresh ACC, SENS, SPEC = eval_model(preds_opt, y_ref) return [ACC,SENS,SPEC,auc_acc]
def test_model(LRM, test_data, TBI = 0, save = 0): global GAMMA """ Evaluate a trained Logistic Regression model Inputs: ======= LRM: Trained Logistic Regression Model test_data: Data to test the LRM on return: Flag - To return [spec, sens, acc] or just acc TBI: Flag - To compute results using TBI or not """ # Get the labels y = test_data.TBResult.values # Get the names of the recordings in the test set test_recs = test_data.StudyNum.values # Keep the feature data for training X = test_data.drop(["StudyNum","TBResult"], axis = 1) probs = LRM.predict_proba(X)[:,1] """ Calculate AUC acc using ROC analysis """ # Get FPR and TPR for the test set fpr, tpr, thresh = roc_curve(y,probs) # Calc AUC acc auc_acc = auc(fpr,tpr) pred = map(int,binarize(np.array(probs).reshape(1,-1),threshold = GAMMA)[0]) # Do the same thing but with pandas # i = np.arange(len(probs)) # temp_df = pd.DataFrame({'StudyNum': pd.Series(test_recs, index = i), # 'Probability': pd.Series(probs, index = i), # 'TBResult': pd.Series(y, index = i), # 'Pred': 0}) # # This makes all predictions = 1 where Prob >= Gamma # temp_df.ix[temp_df.Probability >= GAMMA,'Pred'] = 1 # pred = temp_df.Pred.values if TBI == 0: acc,sens,spec = eval_model(pred, y) return [acc, sens, spec, auc_acc] else: acc,sens,spec = eval_model(pred, y, probs = probs, test_recs = test_recs, TBI = 1, save = save) return [acc, sens, spec]
def load_data(self, features, X_threshold): """ Load data into c_data """ # Load data for each mask self.load_mask_data(features) filename = path.join(mkdtemp(), 'c_data.dat') self.c_data = np.memmap(filename, dtype='object', mode='w+', shape=(self.mask_num)) all_ids = self.dataset.image_table.ids # If a low thresh is set, then get ids for studies at that threshold if self.thresh_low is not None: ids_by_masks_low = [] from neurosynth.analysis.reduce import average_within_regions masks_by_studies_low = average_within_regions( self.dataset, self.mask_img, threshold=self.thresh_low) for mask in masks_by_studies_low: m_ids = np.array(all_ids)[np.where(mask == True)[0]] ids_by_masks_low.append(m_ids) # Set up data into c_data for num, on_ids in enumerate(self.ids_by_masks): # If a low threshold is set, then use that to filter "off_ids", otherwise use "on_ids" if self.thresh_low is not None: off_ids = list(set(all_ids) - set(ids_by_masks_low[num])) else: off_ids = list(set(all_ids) - set(on_ids)) on_data = self.data_by_masks[num].dropna() off_data = self.dataset.get_feature_data(ids=off_ids).dropna() y = np.array([0] * off_data.shape[0] + [1] * on_data.shape[0]) X = np.vstack((np.array(off_data), np.array(on_data))) from neurosynth.analysis.classify import regularize X = regularize(X, method='scale') if X_threshold is not None: X = binarize(X, X_threshold) self.c_data[num] = (X, y) if self.memsave: self.data_by_masks = [] self.ids_by_masks = [] self.comparisons = range(0, self.mask_num) self.comp_dims = (self.mask_num, )
def modelEval(name, model, X, y, binarize_threshold):
    X_train, X_test, y_train, y_test = train_test_split(X_kbest, y, test_size=0.2,
                                                        stratify=y, random_state=rs)
    meancvscore = cross_val_score(model, X, y, n_jobs=-1, verbose=1).mean()
    print('Model %s cross_val_score: %f' % (name, meancvscore))
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    y_pred_adj = binarize(model.predict_proba(X_test)[:, 1],
                          threshold=binarize_threshold, copy=False).transpose()
    print('Model %s classification metrics:' % name)
    doClassifMetrics(y_test, y_pred)
    print('Model %s using prediction threshold %f:' % (name, binarize_threshold))
    doClassifMetrics(y_test, y_pred_adj)
def test_preprocessing_assignment(self): iris = datasets.load_iris() df = pdml.ModelFrame(iris) original_columns = df.data.columns df['sepal length (cm)'] = df['sepal length (cm)'].preprocessing.binarize(threshold=6) self.assertTrue(isinstance(df, pdml.ModelFrame)) binarized = pp.binarize(np.atleast_2d(iris.data[:, 0]), threshold=6) expected = np.hstack([binarized.T, iris.data[:, 1:]]) self.assert_numpy_array_almost_equal(df.data.values, expected) self.assert_index_equal(df.data.columns, original_columns) # recreate data iris = datasets.load_iris() df = pdml.ModelFrame(iris) target_columns = ['sepal length (cm)', 'sepal width (cm)'] df[target_columns] = df[target_columns].preprocessing.binarize(threshold=6) self.assertTrue(isinstance(df, pdml.ModelFrame)) binarized = pp.binarize(iris.data[:, 0:2], threshold=6) expected = np.hstack([binarized, iris.data[:, 2:]]) self.assert_numpy_array_almost_equal(df.data.values, expected) self.assert_index_equal(df.data.columns, original_columns)
def getFeaturesUnigrams(sentence): def normalizeFeatures(values, mn, mx): return np.divide(np.subtract(values, mn), float(mx-mn)) featureDict = {} for i, word in enumerate(sentence.split()): try: representation = model[word] representation = binarize(representation) representation = normalizeFeatures(representation, 0, 1) for j, vectorEntry in enumerate(representation): featureDict[str(i*len(representation)+j)] = vectorEntry except KeyError: continue return featureDict
def load_data(self, features, X_threshold): """ Load data into c_data """ # Load data for each mask self.load_mask_data(features) # Set up pair-wise data self.comparisons = list( itertools.combinations(range(0, self.mask_num), 2)) filename = path.join(mkdtemp(), 'c_data.dat') self.c_data = np.memmap(filename, dtype='object', mode='w+', shape=(self.mask_num, self.mask_num)) # Filter data and arrange into c_data for pair in self.comparisons: x1 = self.data_by_masks[pair[0]] x2 = self.data_by_masks[pair[1]] reg1_ids = self.ids_by_masks[pair[0]] reg2_ids = self.ids_by_masks[pair[1]] if self.remove_overlap is True: reg1_set = list(set(reg1_ids) - set(reg2_ids)) reg2_set = list(set(reg2_ids) - set(reg1_ids)) x1 = np.array(x1)[np.where(np.in1d(reg1_ids, reg1_set))[0]] x2 = np.array(x2)[np.where(np.in1d(reg2_ids, reg2_set))[0]] reg1_ids = reg1_set reg2_ids = reg2_set y = np.array([0] * len(reg1_ids) + [1] * len(reg2_ids)) X = np.vstack((x1, x2)) if X_threshold is not None: X = binarize(X, X_threshold) from neurosynth.analysis.classify import regularize X = regularize(X, method='scale') self.c_data[pair] = (X, y) if self.memsave: self.data_by_masks = [] self.ids_by_masks = [] self.comp_dims = (self.mask_num, self.mask_num)
def get_score(X, y, clf, scoring = 'accuracy'): from sklearn.preprocessing import binarize prediction = binarize(clf.predict(X), 0.5) if scoring == 'accuracy': from sklearn.metrics import accuracy_score score = accuracy_score(y, prediction) elif scoring =='f1': from sklearn.metrics import f1_score score = f1_score(y, prediction) else: score = scoring(y, prediction) return prediction, score
def getFeaturesBigrams(sentence): def normalizeFeatures(values, mn, mx): return np.divide(np.subtract(values, mn), float(mx-mn)) featureDict = {} sentence = sentence.split() bigramSentence = [b for b in zip(sentence[:-1], sentence[1:])] for i, (w1, w2) in enumerate(bigramSentence): try: representation = model[w1 + '_' + w2] representation = binarize(representation) representation = normalizeFeatures(representation, 0, 1) for j, vectorEntry in enumerate(representation): featureDict[str(i*len(representation)+j)] = vectorEntry except KeyError: continue return featureDict
def read_train(train_file):
    lines = []
    y = []
    vectorizer = CountVectorizer(min_df=3)
    tf_idf = TfidfTransformer()
    for parts in utils.read_train(train_file):
        is_blocked = parts[8]
        desc = cleantext.clean(parts[4], False)
        lines.append(desc)
        y.append(int(is_blocked))
    # keep the fitted vectorizer and its output matrix under separate names
    X_counts = vectorizer.fit_transform(lines)
    X_nb = tf_idf.fit_transform(X_counts)
    X_log = binarize(X_counts)
    return X_nb, X_log, numpy.asarray(y)