def decision_path(self, X):
        """Return the decision path in the forest

        .. versionadded:: 0.18

        Parameters
        ----------
        X : array-like or sparse matrix, shape = [n_samples, n_features]
            The input samples. Internally, its dtype will be converted to
            ``dtype=np.float32``. If a sparse matrix is provided, it will be
            converted into a sparse ``csr_matrix``.

        Returns
        -------
        indicator : sparse csr array, shape = [n_samples, n_nodes]
            Return a node indicator matrix where non-zero elements
            indicate that the samples go through the nodes.

        n_nodes_ptr : array of size (n_estimators + 1, )
            The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
            give the indicator value for the i-th estimator.

        """
        X = self._validate_X_predict(X)
        indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
                              backend="threading")(
            delayed(parallel_helper)(tree, 'decision_path', X,
                                      check_input=False)
            for tree in self.estimators_)

        n_nodes = [0]
        n_nodes.extend([i.shape[1] for i in indicators])
        n_nodes_ptr = np.array(n_nodes).cumsum()

        return sparse_hstack(indicators).tocsr(), n_nodes_ptr
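A minimal usage sketch for this API (illustrative names; a small forest fit on toy data), showing how n_nodes_ptr slices the stacked indicator back into one column block per tree:

import numpy as np
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# toy data and a small forest (illustrative only)
X, y = make_classification(n_samples=20, n_features=5, random_state=0)
forest = RandomForestClassifier(n_estimators=3, random_state=0).fit(X, y)

indicator, n_nodes_ptr = forest.decision_path(X)

# columns [n_nodes_ptr[i], n_nodes_ptr[i + 1]) belong to the i-th tree
for i in range(forest.n_estimators):
    per_tree = indicator[:, n_nodes_ptr[i]:n_nodes_ptr[i + 1]]
    # each row counts the nodes a sample visits in tree i (root included)
    print(i, np.asarray(per_tree.sum(axis=1)).ravel()[:5])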
Example #2
    def decision_path(self, X):
        """
        Return the decision path in the forest.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples. Internally, it will be converted to
            ``dtype=np.float64``.

        Returns
        -------
        indicator : sparse matrix of shape (n_samples, n_nodes)
            Return a node indicator matrix where non-zero elements indicate
            that the samples go through the nodes. The matrix is of CSR
            format.
        n_nodes_ptr : ndarray of shape (n_estimators + 1,)
            The columns from indicator[n_nodes_ptr[i]:n_nodes_ptr[i+1]]
            give the indicator value for the i-th estimator.
        """
        X = self._validate_X_predict(X)
        indicators = Parallel(n_jobs=self.n_jobs, verbose=self.verbose, backend='threading')(
            delayed(tree.decision_path)(X, check_input=False)
            for tree in self.estimators_)

        n_nodes = [0]
        n_nodes.extend([i.shape[1] for i in indicators])
        n_nodes_ptr = np.array(n_nodes).cumsum()

        return sparse_hstack(indicators).tocsr(), n_nodes_ptr
Example #3
    def precompute_or_load_feats(data, feats_id_prefix, args):
        # *** all *** means all except n-gram feats
        all_feats_names = []
        all_feats_list = []

        text_feats_names = []
        text_feats_matrix = None
        grams_finished = False
        slen_finished = False
        argssplitlist = [x.strip() for x in args.feats.split(",")]
        PATH_PREDICTION_FEATS = os.path.join(args.data_path, "mbti_enne_pred.csv")

        for feats_type in argssplitlist:
            if "gram" in feats_type and not grams_finished:

                feats_filename = os.path.join(feats_id_prefix, "feats.pickle")
                vocab_filename = os.path.join(feats_id_prefix, "vocab.pickle")
                try:
                    print("Loading precomputed features from disk ...")
                    gram_feats = pickle.load(open(feats_filename, "rb"))
                    gram_feat_names = pickle.load(open(vocab_filename, "rb"))
                except Exception as e:
                    raise Exception("Something went wrong when loading the n-gram features files ...") from e

                text_feats_names = gram_feat_names
                text_feats_matrix = gram_feats
                grams_finished = True

                PandoraAttGen.feat_names = gram_feat_names

            if feats_type == "mbtipred":
                mbti_df = pd.read_csv(PATH_PREDICTION_FEATS)
                joined = data[["author"]].merge(mbti_df, on="author", how="left")
                mbti_feat_names = ["introverted_pred", "intuitive_pred", "thinking_pred", "perceiving_pred"]
                mbti_feats = csr_matrix(np.array(joined[mbti_feat_names]))
                all_feats_names += mbti_feat_names
                all_feats_list.append(mbti_feats)

            if feats_type == "ennepred":
                enne_df = pd.read_csv(PATH_PREDICTION_FEATS)
                joined = data[["author"]].merge(enne_df, on="author", how="left")
                enne_feat_names = ["pred_e_type_" + str(n) for n in range(1, 10)]
                enne_feats = csr_matrix(np.array(joined[enne_feat_names]))
                all_feats_names += enne_feat_names
                all_feats_list.append(enne_feats)

        all_feats_matrix = csr_matrix(sparse_hstack(all_feats_list)) if len(all_feats_names) > 0 else None
        if all_feats_matrix is not None:
            assert all_feats_matrix.shape[1] == len(all_feats_names)
            print(all_feats_matrix.shape)
        if text_feats_matrix is not None:
            assert text_feats_matrix.shape[1] == len(text_feats_names)
            print(text_feats_matrix.shape)
        return (text_feats_matrix, text_feats_names, all_feats_matrix, all_feats_names)
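A minimal sketch (hypothetical toy blocks and names) of the column-stacking pattern used above, where the combined matrix's column count has to line up with the accumulated feature-name list:

import numpy as np
from scipy.sparse import csr_matrix, hstack as sparse_hstack

block_a = csr_matrix(np.array([[1.0, 0.0], [0.0, 2.0]]))  # e.g. MBTI prediction feats
block_b = csr_matrix(np.array([[3.0], [4.0]]))            # e.g. one Enneagram feat
names = ["introverted_pred", "intuitive_pred", "pred_e_type_1"]

all_feats = csr_matrix(sparse_hstack([block_a, block_b]))
assert all_feats.shape[1] == len(names)  # column count must match the name list
print(all_feats.shape)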
Example #4
def _decision_path(isolation_forest, X, n_jobs):
    # code from sklearn RandomForest.
    X = check_array(X, dtype=DTYPE, accept_sparse='csr')
    indicators = Parallel(n_jobs=n_jobs,
                          **_joblib_parallel_args(prefer='threads'))(
                              delayed(parallel_helper)(
                                  tree, 'decision_path', X, check_input=False)
                              for tree in isolation_forest.estimators_)
    n_nodes = [0]
    n_nodes.extend([i.shape[1] for i in indicators])
    n_nodes_ptr = np.array(n_nodes).cumsum()
    indicators = sparse_hstack(indicators).tocsr()
    return indicators, n_nodes_ptr
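A hedged sketch of what the stacked indicator can be used for with an isolation forest (toy data; iso and the depth computation are illustrative, not part of the snippet above): the number of non-zeros per row inside an estimator's column block is that sample's path length in that tree.

import numpy as np
from scipy.sparse import hstack as sparse_hstack
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
X = rng.randn(50, 3)
iso = IsolationForest(n_estimators=5, random_state=0).fit(X)

# same stacking idea as _decision_path above, without the joblib plumbing
indicators = [tree.decision_path(X) for tree in iso.estimators_]
n_nodes_ptr = np.cumsum([0] + [ind.shape[1] for ind in indicators])
indicator = sparse_hstack(indicators).tocsr()

# nodes visited per sample in tree 0 == path length (root included)
depths_tree0 = np.asarray(indicator[:, n_nodes_ptr[0]:n_nodes_ptr[1]].sum(axis=1)).ravel()
print(depths_tree0[:10])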
Example #5
    def transform(self, X):
        bow = self.vectorizer.transform(X)
        if self.pca:
            bow = self.svd.transform(bow)

        # print(bow.shape[1], end='')
        mean_emb = self.mev_transform(X)
        # print(mean_emb.shape)
        if self.pca:  # use np.hstack for numpy array
            combined_emb = np_hstack((bow, mean_emb))
        else:  # use scipy.sparse.hstack for sparse array
            combined_emb = sparse_hstack((bow, mean_emb))
        return combined_emb
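A self-contained sketch of the two branches above (toy documents; mean_emb is a stand-in for the averaged word embeddings): np.hstack only accepts dense arrays, so the sparse bag-of-words block either goes through scipy.sparse.hstack or has to be densified first.

import numpy as np
from numpy import hstack as np_hstack
from scipy.sparse import hstack as sparse_hstack
from sklearn.feature_extraction.text import CountVectorizer

docs = ["a toy document", "another toy document"]
bow = CountVectorizer().fit_transform(docs)        # sparse CSR matrix
mean_emb = np.random.RandomState(0).randn(2, 4)    # stand-in for averaged embeddings

combined_sparse = sparse_hstack((bow, mean_emb))          # sparse result (COO)
combined_dense = np_hstack((bow.toarray(), mean_emb))     # dense route needs .toarray()
print(combined_sparse.shape, combined_dense.shape)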
Example #6
    def restoreMaskedBins(self):
        """
        Puts the removed bins back into the matrix.
        """
        if len(self.orig_bin_ids) == 0:
            return
        # the rows to add are
        # as an empty sparse matrix
        M = self.matrix.shape[0]
        N = len(self.orig_bin_ids) - M
        rows_mat = csr_matrix((N, M))
        # cols to add
        cols_mat = csr_matrix((M + N, N))

        # add the rows and cols at the end of the
        # current matrix
        self.matrix = sparse_vstack([self.matrix, rows_mat])
        self.matrix = sparse_hstack([self.matrix, cols_mat], format='csr')

        # the new matrix has the right number of cols and rows, now
        # they need to be reordered to be back in their original places
        rows = cols = np.argsort(self.orig_bin_ids)
        self.matrix = self.matrix[rows, :][:, cols]
        self.cut_intervals = [self.orig_cut_intervals[x] for x in rows]
        self.interval_trees, self.chrBinBoundaries = \
            self.intervalListToIntervalTree(self.cut_intervals)
        # set as nan_bins the masked bins that were restored
        self.nan_bins = self.orig_bin_ids[M:]

        if self.correction_factors is not None:
            # add missing values as nans at end of array
            self.correction_factors = np.concatenate(
                [self.correction_factors,
                 np.repeat(np.nan, N)])
            # reorder array
            self.correction_factors = self.correction_factors[rows]

        # reset orig bins ids and cut intervals
        self.orig_bin_ids = []
        self.orig_cut_intervals = []
        log.info("masked bins were restored\n")
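restoreMaskedBins pads the matrix with empty sparse blocks and then permutes rows and columns back into their original order; a standalone sketch of that idiom (toy 3-bin matrix with bin 1 masked out; names are illustrative):

import numpy as np
from scipy.sparse import csr_matrix, hstack as sparse_hstack, vstack as sparse_vstack

mat = csr_matrix(np.array([[1.0, 2.0], [2.0, 0.0]]))  # matrix after masking bin 1
orig_bin_ids = np.array([0, 2, 1])                    # kept bin ids first, masked bin id last

M = mat.shape[0]              # current size
N = len(orig_bin_ids) - M     # number of bins to restore
mat = sparse_vstack([mat, csr_matrix((N, M))])                     # empty rows at the end
mat = sparse_hstack([mat, csr_matrix((M + N, N))], format='csr')   # empty cols at the end

order = np.argsort(orig_bin_ids)        # permutation back to the original bin order
restored = mat[order, :][:, order]
print(restored.todense())               # bin 1 reappears as an empty row/column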
Example #7
    def _generate_feats(self, data, mode):
        # lexical feats
        #if mode == "train":
        #    self.tfidf_vect = TfidfVectorizer(ngram_range = self.ngram_rng, min_df = self.min_df, use_idf = self.use_idf)
        #    self.tfidf_vect.fit([x[1:-1] for x in list(data.text)]) # the x[1:-1] strips the initial and final [ and ] from the texts
        #feats = self.tfidf_vect.transform([x[1:-1] for x in list(data.text)])

        feats = self.transformer_model.encode(
            [x[1:-1] for x in list(data.text)])
        feats = np.array(feats)

        if self.use_utterance_feats:
            # utterance feats
            ut_feats = np.zeros((data.shape[0], 3))

            current_mid = data.iloc[0, 8]
            current_max_timestamp = max(
                data[data.meeting_id == current_mid].timestamp)
            for i in range(data.shape[0]):
                text = data.iloc[i, 2][1:-1]
                timestamp = data.iloc[i, 1]
                next_timestamp = data.iloc[i + 1, 1] if (
                    i + 1 < data.shape[0]
                    and data.iloc[i + 1, 8] == data.iloc[i, 8]) else None
                # the first condition handles the end of the data frame (last utterance of the last meeting); the second handles the breaking point between two meetings (last utterance of every meeting)
                # without the second we would get e.g. 1853.2 as the last timestamp of meeting X and 0.0 as the first of meeting Y, and the negative difference would mess things up down the line

                ut_feats[i, 0] = len(text.split(" "))  # length in words
                # 2.0 is just an arbitrary approximate value for the duration of the last utterance of each meeting
                ut_feats[i, 1] = next_timestamp - timestamp if next_timestamp is not None else 2.0
                ut_feats[i, 2] = timestamp / current_max_timestamp

                if next_timestamp is None and i + 1 < data.shape[0]:
                    # breaking point between meetings: update the values for the next iteration
                    current_mid = data.iloc[i + 1, 8]
                    current_max_timestamp = max(
                        data[data.meeting_id == current_mid].timestamp)

            feats = csr_matrix(sparse_hstack([feats, csr_matrix(ut_feats)]))

        # expand all utterance-level feats to include feats of the prev and next utterances
        prev_context_feat_mats, next_context_feat_mats = [], []
        # prev context
        for offset in range(1, self.prev_context_len + 1):
            context_feats = feats[:-offset, :]
            padding = csr_matrix(np.zeros((offset, feats.shape[1])))
            final = sparse_vstack((padding, context_feats))
            prev_context_feat_mats.append(final)

        # next context
        for offset in range(1, self.next_context_len + 1):
            context_feats = feats[offset:, :]
            padding = csr_matrix(np.zeros((offset, feats.shape[1])))
            final = sparse_vstack((context_feats, padding))
            next_context_feat_mats.append(final)

        #feats = sparse_hstack([feats] + prev_context_feat_mats + next_context_feat_mats)

        if self.do_scaling:
            if mode == "train":
                self.scaler = StandardScaler(with_mean=False)
                self.scaler.fit(feats)
            feats = self.scaler.transform(feats)

        return feats
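A standalone sketch of the prev/next context trick above (toy feature matrix; names are illustrative): shifting the rows down or up and padding with zero rows gives each utterance the features of its neighbouring utterances.

import numpy as np
from scipy.sparse import csr_matrix, hstack as sparse_hstack, vstack as sparse_vstack

feats = csr_matrix(np.arange(12, dtype=float).reshape(4, 3))  # 4 utterances, 3 feats each
offset = 1

prev_ctx = sparse_vstack((csr_matrix((offset, feats.shape[1])), feats[:-offset, :]))  # shift down
next_ctx = sparse_vstack((feats[offset:, :], csr_matrix((offset, feats.shape[1]))))   # shift up

expanded = sparse_hstack([feats, prev_ctx, next_ctx]).tocsr()
print(expanded.todense())  # row i now also carries rows i-1 and i+1 (zeros at the edges)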
Example #8
    def transform(self, X):
        bow = self.vectorizer.transform(X)
        d2v_emb = self.d2v_transform(X)
        combined_emb = sparse_hstack((bow, d2v_emb))
        return combined_emb
Example #9
    def generate_setup(data, data_feats, label_type, feat_names, args, extra_feats,
                       extra_feat_names, label_name, feat_size=None, hp=None, fold_in=None):
        # important return stuff
        xval_res = None
        tfidf = None
        fs = None

        cw = "balanced"  # "balanced" or None
        # reg_types = ["l1","l2"]
        reg_types = ["l2"]  # preliminary experiments show that l2 + a relatively low number of selected features performs best, but this might not always be the case

        n_base_feats = data_feats.shape[1] if data_feats is not None else 0
        n_extra_feats = extra_feats.shape[1] if extra_feats is not None else 0
        total_n_feats = n_base_feats + n_extra_feats

        max_features = 20000
        feat_sel_Ncandidates = [int(percentage * total_n_feats) for percentage in [0.05, 0.1, 0.2, 0.3, 0.4, 0.5]]
        if total_n_feats < max_features:
            feat_sel_Ncandidates += ["all"]  # if there are not a lot of feats also try a variant with all feats
        else:  # on the other hand, don't try more than 20k feats (more than that didn't appear to yield significant benefits in prelim. experiments)
            feat_sel_Ncandidates = [x for x in feat_sel_Ncandidates if x <= max_features] + [max_features]

        if feat_size:
            feat_sel_Ncandidates = [feat_size]

        if args.model == "lr":
            hyperparams = [tuple([2 ** i]) for i in range(-10, 5)]  # regularisation strength for the regression models
        elif args.model == "et":
            n_estimators = [100, 200, 300, 400, 500]
            mf = ["auto", 800] if total_n_feats >= 800 else ["auto"]
            bootstrap = [True]
            oob = [True, False]
            hyperparams = list(itertools.product(n_estimators, mf, bootstrap, oob))  # needs `import itertools`; `np.itertools` does not exist
            feat_sel_Ncandidates = ["all"] if total_n_feats < max_features else [max_features]
        elif args.model == "dummy":
            hyperparams = [0]
        else:
            raise Exception("Unknown model type: " + str(args.model))

        if hp:
            hyperparams = [(hp,)]

        valid_indexes = data['label'].notnull()

        # filter out rows with nan vals for extra feats
        if extra_feats is not None:
            ll = np.isnan(extra_feats.todense()).any(axis=1)
            for ind in range(len(ll)):
                if ll[ind]:  # there is a nan in that row
                    valid_indexes[ind] = False
            print("Threw out " + str(np.sum(ll)))

        valid_indexes_numbers = np.where(valid_indexes)[0]

        filtered_data = data[valid_indexes]
        if data_feats is not None:
            filtered_data_feats = data_feats[valid_indexes_numbers, :]
        if extra_feats is not None:
            filtered_extra_feats = extra_feats[valid_indexes_numbers, :]

        folds_to_run = [0, 1, 2, 3, 4] if args.specificfold == -1 else [args.specificfold]
        if fold_in:
            folds_to_run = [fold_in]

        for fold in folds_to_run:
            # print("Starting fold " + str(fold))

            test_fold = fold
            val_fold = (fold + 1) % 5

            train_indexes = (filtered_data['fold'] != test_fold) & (filtered_data['fold'] != val_fold)
            train_indexes_numbers = np.where(train_indexes)[0]

            if data_feats is not None:
                train_feats = filtered_data_feats[train_indexes_numbers]

            if extra_feats is not None:
                train_extra_feats = filtered_extra_feats[train_indexes_numbers]

            train_labels = filtered_data[train_indexes]["label"]

            # apply tfidf weighting
            if data_feats is not None:
                # print("Applying tfidf for this fold.")
                tfidf = TfidfTransformer(sublinear_tf=True)
                train_feats = tfidf.fit_transform(train_feats)

            train_unames = list(filtered_data[train_indexes]["author"])

            # some fixes on the extra feats part
            if extra_feats is not None:
                # scaler = StandardScaler(with_mean = False)
                scaler = MinMaxScaler()

                train_extra_feats = csr_matrix(scaler.fit_transform(train_extra_feats.todense()))

            # combine word feats with all the other feats
            if data_feats is not None and extra_feats is None:
                combined_train_feats = train_feats
            elif data_feats is None and extra_feats is not None:
                combined_train_feats = csr_matrix(train_extra_feats)
            elif data_feats is not None and extra_feats is not None:
                for i in range(train_extra_feats.shape[0]):
                    if np.isnan(train_extra_feats.todense()[i, :]).any():
                        print("NAN FOUND FOR USER : " + str(train_unames[i]))
                combined_train_feats = csr_matrix(sparse_hstack([train_feats, train_extra_feats]))
            else:
                raise Exception("You must supply at least one type of features to use!")

            # run the many loops for testing various versions of this and that
            for feats_N in feat_sel_Ncandidates:
                fs = (SelectKBest(chi2, k=feats_N) if label_type == "classification"
                      else SelectKBest(f_regression, k=feats_N))
                if (feats_N == 0):
                    continue
                train_feats_FS = csr_matrix(fs.fit_transform(combined_train_feats, train_labels))

                def eval_hp(hype, r, l, c, ar, trf, trl):
                    model = PandoraAttGen.spawn_model(hype, r, l, c, ar)
                    model.fit(trf, trl)
                    # print("Finished for " + str(hype))
                    return model

                for reg in reg_types:
                    train_feats_FS.sort_indices()

                    xval_res = Parallel(n_jobs=12)(
                        delayed(eval_hp)(h, reg, label_type, cw, args, train_feats_FS, train_labels)
                        for h in hyperparams)

        print("*")
        print("*")
        print("*")
        print("*")

        return xval_res[0], fs, tfidf
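A compressed, hedged sketch (toy data, illustrative shapes) of the per-fold feature pipeline above: tf-idf on the text block, min-max scaling on the extra block, column-stack both, then keep the top-k columns.

import numpy as np
from scipy.sparse import csr_matrix, hstack as sparse_hstack
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.preprocessing import MinMaxScaler

rng = np.random.RandomState(0)
train_feats = csr_matrix(rng.poisson(1.0, size=(30, 50)).astype(float))  # raw term counts
train_extra_feats = csr_matrix(rng.rand(30, 4))
train_labels = rng.rand(30)

train_feats = TfidfTransformer(sublinear_tf=True).fit_transform(train_feats)
train_extra_feats = csr_matrix(MinMaxScaler().fit_transform(train_extra_feats.toarray()))

combined_train_feats = csr_matrix(sparse_hstack([train_feats, train_extra_feats]))
fs = SelectKBest(f_regression, k=10)
train_feats_FS = csr_matrix(fs.fit_transform(combined_train_feats, train_labels))
print(train_feats_FS.shape)  # (30, 10)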
Example #10
    def restoreMaskedBins(self):
        """
        Puts the removed bins back into the matrix.


        Examples
        --------
        >>> from scipy.sparse import coo_matrix
        >>> row, col = np.triu_indices(5)
        >>> cut_intervals = [('a', 0, 10, 1), ('a', 10, 20, 1),
        ... ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
        >>> hic = hiCMatrix()
        >>> hic.nan_bins = []
        >>> matrix = np.array([
        ... [ 0, 10,  5, 3, 0],
        ... [ 0,  0, 15, 5, 1],
        ... [ 0,  0,  0, 7, 3],
        ... [ 0,  0,  0, 0, 1],
        ... [ 0,  0,  0, 0, 0]], dtype=np.int32)

        make the matrix symmetric:
        >>> hic.matrix = csr_matrix(matrix + matrix.T)
        >>> hic.setMatrix(csr_matrix(matrix + matrix.T), cut_intervals)

        Mask some bins:
        >>> hic.maskBins([3])
        >>> hic.matrix.todense()
        matrix([[ 0, 10,  5,  0],
                [10,  0, 15,  1],
                [ 5, 15,  0,  3],
                [ 0,  1,  3,  0]], dtype=int32)
        >>> hic.cut_intervals
        [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1), ('b', 40, 50, 1)]

        >>> hic.restoreMaskedBins()
        >>> hic.matrix.todense()
        matrix([[ 0., 10.,  5.,  0.,  0.],
                [10.,  0., 15.,  0.,  1.],
                [ 5., 15.,  0.,  0.,  3.],
                [ 0.,  0.,  0.,  0.,  0.],
                [ 0.,  1.,  3.,  0.,  0.]])

        >>> hic.cut_intervals
        [('a', 0, 10, 1), ('a', 10, 20, 1), ('a', 20, 30, 1), ('a', 30, 40, 1), ('b', 40, 50, 1)]
        """
        if len(self.orig_bin_ids) == 0:
            return
        # the rows to add are
        # as an empty sparse matrix
        M = self.matrix.shape[0]
        N = len(self.orig_bin_ids) - M
        rows_mat = csr_matrix((N, M))
        # cols to add
        cols_mat = csr_matrix((M + N, N))

        # add the rows and cols at the end of the
        # current matrix
        self.matrix = sparse_vstack([self.matrix, rows_mat])
        self.matrix = sparse_hstack([self.matrix, cols_mat], format='csr')

        # the new matrix has the right number of cols and rows, now
        # they need to be reordered to be back in their original places
        rows = cols = np.argsort(self.orig_bin_ids)
        self.matrix = self.matrix[rows, :][:, cols]
        self.cut_intervals = [self.orig_cut_intervals[x] for x in rows]
        self.interval_trees, self.chrBinBoundaries = \
            self.intervalListToIntervalTree(self.cut_intervals)
        # set as nan_bins the masked bins that were restored
        self.nan_bins = self.orig_bin_ids[M:]

        if self.correction_factors is not None:
            # add missing values as nans at end of array
            self.correction_factors = np.concatenate(
                [self.correction_factors,
                 np.repeat(np.nan, N)])
            # reorder array
            self.correction_factors = self.correction_factors[rows]

        # reset orig bins ids and cut intervals
        self.orig_bin_ids = []
        self.orig_cut_intervals = []
        log.info("masked bins were restored\n")