def decision_path(self, X): """Return the decision path in the forest .. versionadded:: 0.18 Parameters ---------- X : array-like or sparse matrix, shape = [n_samples, n_features] The input samples. Internally, its dtype will be converted to ``dtype=np.float32``. If a sparse matrix is provided, it will be converted into a sparse ``csr_matrix``. Returns ------- indicator : sparse csr array, shape = [n_samples, n_nodes] Return a node indicator matrix where non zero elements indicates that the samples goes through the nodes. n_nodes_ptr : array of size (n_estimators + 1, ) The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]] gives the indicator value for the i-th estimator. """ X = self._validate_X_predict(X) indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend="threading")( delayed(parallel_helper)(tree, 'decision_path', X, check_input=False) for tree in self.estimators_) n_nodes = [0] n_nodes.extend([i.shape[1] for i in indicators]) n_nodes_ptr = np.array(n_nodes).cumsum() return sparse_hstack(indicators).tocsr(), n_nodes_ptr/fit
def decision_path(self, X): """ Return the decision path in the forest. Parameters ---------- X : array-like of shape (n_samples, n_features) The input samples. Internally, it will be converted to ``dtype=np.float64``. Returns ------- indicator : sparse matrix of shape (n_samples, n_nodes) Return a node indicator matrix where non zero elements indicates that the samples goes through the nodes. The matrix is of CSR format. n_nodes_ptr : ndarray of shape (n_estimators + 1,) The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]] gives the indicator value for the i-th estimator. """ X = self._validate_X_predict(X) indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend='threading')( delayed(tree.decision_path)(X, check_input=False) for tree in self.estimators_) n_nodes = [0] n_nodes.extend([i.shape[1] for i in indicators]) n_nodes_ptr = np.array(n_nodes).cumsum() return sparse_hstack(indicators).tocsr(), n_nodes_ptr
def precompute_or_load_feats(data, feats_id_prefix, args):
    # *** all *** means all except n-gram feats
    all_feats_names = []
    all_feats_list = []
    text_feats_names = []
    text_feats_matrix = None
    grams_finished = False
    slen_finished = False
    argssplitlist = [x.strip() for x in args.feats.split(",")]
    PATH_PREDICTION_FEATS = os.path.join(args.data_path, "mbti_enne_pred.csv")

    for feats_type in argssplitlist:
        if "gram" in feats_type and not grams_finished:
            feats_filename = os.path.join(feats_id_prefix, "feats.pickle")
            vocab_filename = os.path.join(feats_id_prefix, "vocab.pickle")
            try:
                print("Loading precomputed features from disk ...")
                gram_feats = pickle.load(open(feats_filename, "rb"))
                gram_feat_names = pickle.load(open(vocab_filename, "rb"))
            except Exception as e:
                raise Exception("Something went wrong when loading the n-gram features files ...") from e
            text_feats_names = gram_feat_names
            text_feats_matrix = gram_feats
            grams_finished = True
            PandoraAttGen.feat_names = gram_feat_names

        if feats_type == "mbtipred":
            mbti_df = pd.read_csv(PATH_PREDICTION_FEATS)
            joined = data[["author"]].merge(mbti_df, on="author", how="left")
            mbti_feat_names = ["introverted_pred", "intuitive_pred",
                               "thinking_pred", "perceiving_pred"]
            mbti_feats = csr_matrix(np.array(joined[mbti_feat_names]))
            all_feats_names += mbti_feat_names
            all_feats_list.append(mbti_feats)

        if feats_type == "ennepred":
            enne_df = pd.read_csv(PATH_PREDICTION_FEATS)
            joined = data[["author"]].merge(enne_df, on="author", how="left")
            enne_feat_names = ["pred_e_type_" + str(n) for n in range(1, 10)]
            enne_feats = csr_matrix(np.array(joined[enne_feat_names]))
            all_feats_names += enne_feat_names
            all_feats_list.append(enne_feats)

    all_feats_matrix = csr_matrix(sparse_hstack(all_feats_list)) if len(all_feats_names) > 0 else None

    if all_feats_matrix is not None:
        assert all_feats_matrix.shape[1] == len(all_feats_names)
        print(all_feats_matrix.shape)
    if text_feats_matrix is not None:
        assert text_feats_matrix.shape[1] == len(text_feats_names)
        print(text_feats_matrix.shape)

    return (text_feats_matrix, text_feats_names, all_feats_matrix, all_feats_names)
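# An illustrative sketch (toy frames, not the Pandora data) of the join-then-stack pattern in
# precompute_or_load_feats: left-merge per-author prediction columns onto the working frame,
# convert each block to CSR, and hstack the blocks into a single feature matrix.
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack as sparse_hstack

data = pd.DataFrame({"author": ["a", "b", "c"]})
preds = pd.DataFrame({"author": ["a", "b", "c"],
                      "introverted_pred": [0.9, 0.1, 0.4],
                      "thinking_pred": [0.2, 0.8, 0.5]})

joined = data[["author"]].merge(preds, on="author", how="left")
blocks = [csr_matrix(joined[["introverted_pred"]].to_numpy()),
          csr_matrix(joined[["thinking_pred"]].to_numpy())]
all_feats_matrix = csr_matrix(sparse_hstack(blocks))
print(all_feats_matrix.shape)  # (3, 2)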
def _decision_path(isolation_forest, X, n_jobs):
    # code from sklearn RandomForest.
    X = check_array(X, dtype=DTYPE, accept_sparse='csr')
    indicators = Parallel(n_jobs=n_jobs,
                          **_joblib_parallel_args(prefer='threads'))(
        delayed(parallel_helper)(tree, 'decision_path', X, check_input=False)
        for tree in isolation_forest.estimators_)

    n_nodes = [0]
    n_nodes.extend([i.shape[1] for i in indicators])
    n_nodes_ptr = np.array(n_nodes).cumsum()

    indicators = sparse_hstack(indicators).tocsr()
    return indicators, n_nodes_ptr
def transform(self, X):
    bow = self.vectorizer.transform(X)
    if self.pca:
        bow = self.svd.transform(bow)
        # print(bow.shape[1], end='')
    mean_emb = self.mev_transform(X)
    # print(mean_emb.shape)
    if self.pca:
        # use np.hstack for numpy array
        combined_emb = np_hstack((bow, mean_emb))
    else:
        # use scipy.sparse.hstack for sparse array
        combined_emb = sparse_hstack((bow, mean_emb))
    return combined_emb
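# A small self-contained sketch of the dense-vs-sparse stacking choice made above: a
# SVD/PCA-reduced bag-of-words block is dense and can be stacked with numpy, while a raw
# sparse bag-of-words block should go through scipy.sparse.hstack. All names here are
# illustrative, not the original class's attributes.
import numpy as np
from scipy.sparse import csr_matrix, hstack as sparse_hstack

bow = csr_matrix(np.array([[1.0, 0.0, 2.0], [0.0, 3.0, 0.0]]))  # sparse BOW counts
mean_emb = np.array([[0.1, 0.2], [0.3, 0.4]])                   # dense embeddings

combined_sparse = sparse_hstack((bow, csr_matrix(mean_emb))).tocsr()
combined_dense = np.hstack((bow.toarray(), mean_emb))
print(combined_sparse.shape, combined_dense.shape)  # (2, 5) (2, 5)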
def restoreMaskedBins(self): """ Puts backs into the matrix the bins removed """ if len(self.orig_bin_ids) == 0: return # the rows to add are # as an empty sparse matrix M = self.matrix.shape[0] N = len(self.orig_bin_ids) - M rows_mat = csr_matrix((N, M)) # cols to add cols_mat = csr_matrix((M + N, N)) # add the rows and cols at the end of the # current matrix self.matrix = sparse_vstack([self.matrix, rows_mat]) self.matrix = sparse_hstack([self.matrix, cols_mat], format='csr') # the new matrix has the right number of cols and rows, now # they need to be reordered to be back in their original places rows = cols = np.argsort(self.orig_bin_ids) self.matrix = self.matrix[rows, :][:, cols] self.cut_intervals = [self.orig_cut_intervals[x] for x in rows] self.interval_trees, self.chrBinBoundaries = \ self.intervalListToIntervalTree(self.cut_intervals) # set as nan_bins the masked bins that were restored self.nan_bins = self.orig_bin_ids[M:] if self.correction_factors is not None: # add missing values as nans at end of array self.correction_factors = np.concatenate( [self.correction_factors, np.repeat(np.nan, N)]) # reorder array self.correction_factors = self.correction_factors[rows] # reset orig bins ids and cut intervals self.orig_bin_ids = [] self.orig_cut_intervals = [] log.info("masked bins were restored\n")
def _generate_feats(self, data, mode):
    # lexical feats
    # if mode == "train":
    #     self.tfidf_vect = TfidfVectorizer(ngram_range=self.ngram_rng, min_df=self.min_df, use_idf=self.use_idf)
    #     self.tfidf_vect.fit([x[1:-1] for x in list(data.text)])
    # feats = self.tfidf_vect.transform([x[1:-1] for x in list(data.text)])
    # the x[1:-1] strips the initial and final [ and ] from the texts
    feats = self.transformer_model.encode([x[1:-1] for x in list(data.text)])
    feats = np.array(feats)

    if self.use_utterance_feats:
        # utterance feats
        ut_feats = np.zeros((data.shape[0], 3))
        current_mid = data.iloc[0, 8]
        current_max_timestamp = max(data[data.meeting_id == current_mid].timestamp)
        for i in range(data.shape[0]):
            text = data.iloc[i, 2][1:-1]
            timestamp = data.iloc[i, 1]
            # The first condition handles the end of the data frame (last utterance of the last
            # meeting); the second handles the breaking point between two meetings (last
            # utterance of every meeting). Without the second we would get e.g. 1853.2 as the
            # last timestamp of meeting X and 0.0 as the first in meeting Y, and the difference
            # would be negative, which messes up things down the line.
            next_timestamp = data.iloc[i + 1, 1] if (i + 1 < data.shape[0] and
                                                     data.iloc[i + 1, 8] == data.iloc[i, 8]) else None
            ut_feats[i, 0] = len(text.split(" "))  # length in words
            # 2.0 is just an arbitrary approximate value for the duration of the last utterance of each meeting
            ut_feats[i, 1] = next_timestamp - timestamp if next_timestamp is not None else 2.0
            ut_feats[i, 2] = timestamp / current_max_timestamp
            if next_timestamp is None and i + 1 < data.shape[0]:
                # this is a breaking point between meetings and we have to update some of the vals for the next iteration
                current_mid = data.iloc[i + 1, 8]
                current_max_timestamp = max(data[data.meeting_id == current_mid].timestamp)

        feats = csr_matrix(sparse_hstack([feats, csr_matrix(ut_feats)]))

    # expand all utterance level feats to include feats of the prev and next utterances
    prev_context_feat_mats, next_context_feat_mats = [], []
    # prev context
    for offset in range(1, self.prev_context_len + 1):
        context_feats = feats[:-offset, :]
        padding = csr_matrix(np.zeros((offset, feats.shape[1])))
        final = sparse_vstack((padding, context_feats))
        prev_context_feat_mats.append(final)
    # next context
    for offset in range(1, self.next_context_len + 1):
        context_feats = feats[offset:, :]
        padding = csr_matrix(np.zeros((offset, feats.shape[1])))
        final = sparse_vstack((context_feats, padding))
        next_context_feat_mats.append(final)
    # feats = sparse_hstack([feats] + prev_context_feat_mats + next_context_feat_mats)

    if self.do_scaling:
        if mode == "train":
            self.scaler = StandardScaler(with_mean=False)
            self.scaler.fit(feats)
        feats = self.scaler.transform(feats)

    return feats
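# A standalone sketch of the context-window construction used in _generate_feats: shifting a
# per-utterance feature matrix down/up with zero padding yields "previous"/"next" utterance
# features that can be hstacked onto the original rows. Sizes and the helper name are made up.
import numpy as np
from scipy.sparse import csr_matrix, hstack as sparse_hstack, vstack as sparse_vstack

feats = csr_matrix(np.arange(12, dtype=float).reshape(4, 3))  # 4 utterances x 3 feats

def shifted(feats, offset, direction):
    """Return feats shifted by `offset` rows, padded with zero rows."""
    padding = csr_matrix((offset, feats.shape[1]))
    if direction == "prev":   # row i receives the features of utterance i - offset
        return sparse_vstack((padding, feats[:-offset, :]))
    else:                     # row i receives the features of utterance i + offset
        return sparse_vstack((feats[offset:, :], padding))

with_context = sparse_hstack(
    [feats, shifted(feats, 1, "prev"), shifted(feats, 1, "next")]).tocsr()
print(with_context.shape)  # (4, 9)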
def transform(self, X):
    bow = self.vectorizer.transform(X)
    d2v_emb = self.d2v_transform(X)
    combined_emb = sparse_hstack((bow, d2v_emb))
    return combined_emb
def generate_setup(data, data_feats, label_type, feat_names, args, extra_feats,
                   extra_feat_names, label_name, feat_size=None, hp=None, fold_in=None):
    # important return stuff
    xval_res = None
    tfidf = None
    fs = None
    cw = "balanced"  # "balanced" or None
    # reg_types = ["l1", "l2"]
    # preliminary experiments show that l2 + a relatively low number of features in feat sel
    # performs the best, but this might not always be the case
    reg_types = ["l2"]

    n_base_feats = data_feats.shape[1] if data_feats is not None else 0
    n_extra_feats = extra_feats.shape[1] if extra_feats is not None else 0
    total_n_feats = n_base_feats + n_extra_feats

    max_features = 20000
    feat_sel_Ncandidates = [int(percentage * total_n_feats)
                            for percentage in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]]
    if total_n_feats < max_features:
        # if there are not a lot of feats, also try a variant with all feats
        feat_sel_Ncandidates += ["all"]
    else:
        # on the other hand, don't try more than 20k feats (more than that didn't appear to
        # yield significant benefits in prelim. experiments)
        feat_sel_Ncandidates = [x for x in feat_sel_Ncandidates if x <= max_features] + [max_features]
    if feat_size:
        feat_sel_Ncandidates = [feat_size]

    if args.model == "lr":
        # regularisation strength for the regression models
        hyperparams = [tuple([2 ** i]) for i in range(-10, 5)]
    elif args.model == "et":
        n_estimators = [100, 200, 300, 400, 500]
        mf = ["auto", 800] if total_n_feats >= 800 else ["auto"]
        bootstrap = [True]
        oob = [True, False]
        hyperparams = list(itertools.product(*[n_estimators, mf, bootstrap, oob]))
        feat_sel_Ncandidates = ["all"] if total_n_feats < max_features else [max_features]
    elif args.model == "dummy":
        hyperparams = [0]
    else:
        raise Exception("Unknown model type: " + str(args.model))
    if hp:
        hyperparams = [(hp,)]

    valid_indexes = data['label'].notnull()
    # filter out rows with nan vals for extra feats
    if extra_feats is not None:
        ll = np.isnan(extra_feats.todense()).any(axis=1)
        for ind in range(len(ll)):
            if ll[ind]:  # there is a nan in that row
                valid_indexes[ind] = False
        print("Threw out " + str(np.sum(ll)))
    valid_indexes_numbers = np.where(valid_indexes)[0]

    filtered_data = data[valid_indexes]
    if data_feats is not None:
        filtered_data_feats = data_feats[valid_indexes_numbers, :]
    if extra_feats is not None:
        filtered_extra_feats = extra_feats[valid_indexes_numbers, :]

    folds_to_run = [0, 1, 2, 3, 4] if args.specificfold == -1 else [args.specificfold]
    if fold_in:
        folds_to_run = [fold_in]

    for fold in folds_to_run:
        # print("Starting fold " + str(fold))
        test_fold = fold
        val_fold = (fold + 1) % 5
        train_indexes = (filtered_data['fold'] != test_fold) & (filtered_data['fold'] != val_fold)
        train_indexes_numbers = np.where(train_indexes)[0]

        if data_feats is not None:
            train_feats = filtered_data_feats[train_indexes_numbers]
        if extra_feats is not None:
            train_extra_feats = filtered_extra_feats[train_indexes_numbers]
        train_labels = filtered_data[train_indexes]["label"]

        # apply tfidf weighting
        if data_feats is not None:
            # print("Applying tfidf for this fold.")
            tfidf = TfidfTransformer(sublinear_tf=True)
            train_feats = tfidf.fit_transform(train_feats)

        train_unames = list(filtered_data[train_indexes]["author"])

        # some fixes on the extra feats part
        if extra_feats is not None:
            # scaler = StandardScaler(with_mean=False)
            scaler = MinMaxScaler()
            train_extra_feats = csr_matrix(scaler.fit_transform(train_extra_feats.todense()))

        # combine word feats with all the other feats
        if data_feats is not None and extra_feats is None:
            combined_train_feats = train_feats
        elif data_feats is None and extra_feats is not None:
            combined_train_feats = csr_matrix(train_extra_feats)
        elif data_feats is not None and extra_feats is not None:
            for i in range(train_extra_feats.shape[0]):
                if np.isnan(train_extra_feats.todense()[i, :]).any():
                    # the argument of this print was masked in the source; printing the
                    # author name of the offending row is an assumption
                    print("NAN FOUND FOR USER :" + str(train_unames[i]))
            # this assignment was also masked in the source; stacking word feats with the
            # extra feats is an assumption based on how combined_train_feats is used below
            combined_train_feats = csr_matrix(sparse_hstack([train_feats, train_extra_feats]))
        else:
            raise Exception("You must supply at least one type of features to use!")

        # run the many loops for testing various versions of this and that
        for feats_N in feat_sel_Ncandidates:
            fs = SelectKBest(chi2, k=feats_N) if label_type == "classification" \
                else SelectKBest(f_regression, k=feats_N)
            if feats_N == 0:
                continue
            train_feats_FS = csr_matrix(fs.fit_transform(combined_train_feats, train_labels))

            def eval_hp(hype, r, l, c, ar, trf, trl):
                model = PandoraAttGen.spawn_model(hype, r, l, c, ar)
                model.fit(trf, trl)
                # print("Finished for " + str(hype))
                return model

            for reg in reg_types:
                train_feats_FS.sort_indices()
                xval_res = Parallel(n_jobs=12)(
                    delayed(eval_hp)(h, reg, label_type, cw, args, train_feats_FS, train_labels)
                    for h in hyperparams)

    print("*")
    print("*")
    print("*")
    print("*")
    return xval_res[0], fs, tfidf
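# A compact, self-contained sketch (random illustrative data, not the Pandora setup above) of
# the per-fold pattern used in generate_setup: tf-idf weight the training counts, select the
# top-k features with chi2, and fit a model on the reduced matrix.
import numpy as np
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression

counts = csr_matrix(np.random.RandomState(0).poisson(0.3, size=(40, 50)).astype(float))
labels = np.random.RandomState(1).randint(0, 2, size=40)

train_feats = TfidfTransformer(sublinear_tf=True).fit_transform(counts)
fs = SelectKBest(chi2, k=10)
train_feats_FS = fs.fit_transform(train_feats, labels)

model = LogisticRegression(C=1.0, class_weight="balanced").fit(train_feats_FS, labels)
print(train_feats_FS.shape, model.score(train_feats_FS, labels))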
def restoreMaskedBins(self): """ Puts backs into the matrix the bins removed Examples -------- >>> from scipy.sparse import coo_matrix >>> row, col = np.triu_indices(5) >>> cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1), ... ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)] >>> hic = hiCMatrix() >>> hic.nan_bins = [] >>> matrix = np.array([ ... [ 0, 10, 5, 3, 0], ... [ 0, 0, 15, 5, 1], ... [ 0, 0, 0, 7, 3], ... [ 0, 0, 0, 0, 1], ... [ 0, 0, 0, 0, 0]], dtype=np.int32) make the matrix symmetric: >>> hic.matrix = csr_matrix(matrix + matrix.T) >>> hic.setMatrix(csr_matrix(matrix + matrix.T), cut_intervals) Add masked bins masked bins >>> hic.maskBins([3]) >>> hic.matrix.todense() matrix([[ 0, 10, 5, 0], [10, 0, 15, 1], [ 5, 15, 0, 3], [ 0, 1, 3, 0]], dtype=int32) >>> hic.cut_intervals [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1), ('b', 40, 50, 1)] >>> hic.restoreMaskedBins() >>> hic.matrix.todense() matrix([[ 0., 10., 5., 0., 0.], [10., 0., 15., 0., 1.], [ 5., 15., 0., 0., 3.], [ 0., 0., 0., 0., 0.], [ 0., 1., 3., 0., 0.]]) >>> hic.cut_intervals [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)] """ if len(self.orig_bin_ids) == 0: return # the rows to add are # as an empty sparse matrix M = self.matrix.shape[0] N = len(self.orig_bin_ids) - M rows_mat = csr_matrix((N, M)) # cols to add cols_mat = csr_matrix((M + N, N)) # add the rows and cols at the end of the # current matrix self.matrix = sparse_vstack([self.matrix, rows_mat]) self.matrix = sparse_hstack([self.matrix, cols_mat], format='csr') # the new matrix has the right number of cols and rows, now # they need to be reordered to be back in their original places rows = cols = np.argsort(self.orig_bin_ids) self.matrix = self.matrix[rows, :][:, cols] self.cut_intervals = [self.orig_cut_intervals[x] for x in rows] self.interval_trees, self.chrBinBoundaries = \ self.intervalListToIntervalTree(self.cut_intervals) # set as nan_bins the masked bins that were restored self.nan_bins = self.orig_bin_ids[M:] if self.correction_factors is not None: # add missing values as nans at end of array self.correction_factors = np.concatenate( [self.correction_factors, np.repeat(np.nan, N)]) # reorder array self.correction_factors = self.correction_factors[rows] # reset orig bins ids and cut intervals self.orig_bin_ids = [] self.orig_cut_intervals = [] log.info("masked bins were restored\n")