from sklearn.feature_selection import SelectFromModel  # needed for the tree-based selection below


def featureSelectionTree(data):
    # column 1 holds the label, columns 2 onward hold the raw feature values
    label = data[:, 1]
    data[:, 2:] = np.abs(data[:, 2:])
    datanew = data[:, 2:]

    clf = ExtraTreesClassifier()
    clf.fit(datanew, label)
    selector = SelectFromModel(clf, prefit=True)
    X_new = selector.transform(datanew)
    size = X_new.shape[1]
    data[:, 2:size + 2] = X_new
    fd = open('History.txt', 'a')
    history = ('Feature Selection: Tree\n'
               'Selected Feature: ' + str(selector.get_support(indices=True)) + '\n')
    fd.write(history)
    fd.close()
    return data[:, :size + 2], size
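

# A minimal, illustrative call of featureSelectionTree (not part of the original project).
# It assumes NumPy and ExtraTreesClassifier are imported as elsewhere in this file, and that
# the array layout is: column 0 = id, column 1 = label, columns 2+ = raw features.
if __name__ == '__main__':
    rng = np.random.RandomState(0)
    demo_ids = np.arange(100, dtype=float).reshape(-1, 1)
    demo_labels = rng.randint(0, 2, size=(100, 1)).astype(float)
    demo_features = rng.randn(100, 10)
    demo_data = np.hstack([demo_ids, demo_labels, demo_features])
    reduced, n_selected = featureSelectionTree(demo_data)
    print reduced.shape, n_selected
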
class Model:
    def __init__(self):
        # self.features_selector = VarianceThreshold(threshold=(.8 * (1 - .8)))
        # self.features_selector = SelectKBest(k="all")
        # self.features_selector = SelectPercentile(score_func=SelectFpr, percentile=16)
        self.features_selector = ExtraTreesClassifier(n_estimators=250,
                                                      max_features=20)

        self.dict_vectorizer = DictVectorizer()
        self.scaler = StandardScaler(copy=True)

    def vectorize(self, X, y, fit=True):
        # digitize categories
        if fit:
            self.dict_vectorizer.fit(X)
        X = self.dict_vectorizer.transform(X).toarray()
        return X, y

    def scale(self, X, y, fit=True):
        # scale numbers
        if fit:
            self.scaler.fit(X)
        X = self.scaler.transform(X)
        return X, y

    def all_feature_names(self):
        return self.dict_vectorizer.get_feature_names()

    def selected_feature_names(self):
        names = []
        all_names = np.array(self.all_feature_names())
        return all_names[self.feats['ensemble']]

        # if hasattr(self.features_selector, 'get_support'):
        #     for i in self.features_selector.get_support(indices=True):
        #         names.append(all_names[i])
        # else:
        #     feature_importance = self.features_selector.feature_importances_
        #     feature_importance = 100.0 * (feature_importance / feature_importance.max())
        #     sorted_idx = np.argsort(feature_importance)[::-1]
        #     names = np.array(self.all_feature_names()[:len(sorted_idx)])
        #     """
        #     for name, imp in zip(names[sorted_idx], feature_importance[sorted_idx]):
        #         # i = indices[f]
        #         print "%s (%f)" % (name, imp),
        #     """
        # sel_count = int(math.log(len(sorted_idx), 2))
        # return names[sorted_idx][:self.features_selector.max_features]

    def save_features(self, X, y):
        feats = dict()

        print "univariate feature selectors"
        selector_clf = SelectKBest(score_func=f_classif, k='all')
        selector_clf.fit(X, y)
        pvalues_clf = selector_clf.pvalues_
        pvalues_clf[np.isnan(pvalues_clf)] = 1

        #put feature vectors into dictionary
        feats['univ_sub01'] = (pvalues_clf < 0.1)
        feats['univ_sub005'] = (pvalues_clf < 0.05)
        feats['univ_clf_sub005'] = (pvalues_clf < 0.05)

        print "randomized logistic regression feature selector"
        sel_log = linear_model.RandomizedLogisticRegression(random_state=42,
                                                            n_jobs=4).fit(
                                                                X, y)
        #put rand_lasso feats into feature dict
        feats['rand_logreg'] = sel_log.get_support()

        print "l1-based feature selectors"
        X_sp = sparse.coo_matrix(X)
        sel_svc = svm.LinearSVC(C=0.1,
                                penalty="l1",
                                dual=False,
                                random_state=42).fit(X, y)
        feats['LinearSVC'] = np.ravel(sel_svc.coef_ > 0)
        sel_log = linear_model.LogisticRegression(C=0.01, random_state=42).fit(
            X_sp, y)
        feats['LogReg'] = np.ravel(sel_log.coef_ > 0)

        tree_max_features = 20
        print "ExtraTrees feature selectors (%s)" % tree_max_features
        feats['tree'] = np.zeros(len(feats['LogReg']))
        tree = ExtraTreesClassifier(n_estimators=250,
                                    max_features=tree_max_features)
        tree.fit(X, y)
        feature_importance = tree.feature_importances_
        feature_importance = 100.0 * (feature_importance /
                                      feature_importance.max())
        sorted_idx = np.argsort(feature_importance)[::-1]
        for i in xrange(tree_max_features):
            feats['tree'][sorted_idx[i]] = 1

        feat_sums = np.zeros(len(feats['LogReg']))
        for key in feats:
            feat_sums += feats[key].astype(int)
        feats['ensemble'] = feat_sums >= 4  # keep features that get at least 4 votes
        joblib.dump(feats, 'features/feats.pkl', compress=3)
        return feats

    def load_features(self):
        return joblib.load('features/feats.pkl')

    def select_features(self, X, y, fit=True):
        if fit:
            # self.features_selector.fit(X,y)
            # print "Selected Features:"
            # print self.selected_feature_names()
            # print
            self.feats = self.save_features(X, y)
            # pass
        # self.feats = self.load_features()
        # X = self.features_selector.transform(X)
        print "Selected Features:"
        print self.selected_feature_names()
        print
        return X[:, self.feats['ensemble']], y

    def split_data(self, X, y, ids, cross_validate):
        if not cross_validate:
            return X, [], y, [], ids, []

        # append ids so we can identify who is in test and who is in train set
        X = np.c_[X, ids]
        # split data
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=.3)  # , random_state=0
        # store ids
        train_ids = X_train[:, -1]
        test_ids = X_test[:, -1]
        # remove ids
        X_train = np.delete(X_train, -1, 1).astype(np.float)
        X_test = np.delete(X_test, -1, 1).astype(np.float)

        return X_train, X_test, y_train, y_test, train_ids, test_ids

    def get_columns_from_selected_features(self, featureNames):
        # assumes self.features_selector exposes get_support(); that holds for the
        # univariate selectors commented out in __init__, but not for ExtraTreesClassifier
        all_names = self.all_feature_names()
        featureNames = set(featureNames)
        columns = []

        for i, j in enumerate(
                self.features_selector.get_support(indices=True)):
            if all_names[j] in featureNames:
                columns.append(i)

        return columns

    def get_columns_for_features(self, featureNames):
        all_names = self.all_feature_names()
        cols = []
        for feature in featureNames:
            cols.append(all_names.index(feature))
        return cols

    def standard_prepare(self, X, y, fit=True, cross_validate=True):
        X, y = self.vectorize(X, y, fit)
        X, y = self.select_features(X, y, fit)

        X = np.array(X)
        y = np.array(y)
        self.X_unscaled = X
        self.y_unscaled = y

        X, y = self.scale(X, y, fit)
        self.X_scaled = X
        self.y_scaled = y

        return X, y

    def prepare(self, X, y, ids, fit=True, cross_validate=True):
        X, y = self.standard_prepare(X, y, fit, cross_validate)
        self.ids = ids
        self.X_train, self.X_test, self.y_train, self.y_test, self.train_ids, self.test_ids = self.split_data(
            X, y, ids, cross_validate)

    def apply_set(self, bidders):
        # TODO: first apply filtering - then split the data
        self.uX_scaled = []
        self.uy_scaled = []
        n = len(self.ids)
        for i in xrange(n):
            if self.ids[i] in bidders:
                self.uX_scaled.append(self.X_scaled[i])
                self.uy_scaled.append(self.y_scaled[i])
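

# Illustrative (commented-out) usage of the Model pipeline above; the variable names are
# hypothetical, not from the original project. The real code feeds X as a list of per-bidder
# feature dicts, y as labels, and ids per sample; note that save_features() expects a writable
# 'features/' directory and at least 20 vectorized columns for the ExtraTrees selector.
#
#   model = Model()
#   model.prepare(bidder_feature_dicts, bidder_labels, bidder_ids, fit=True, cross_validate=True)
#   clf = svm.SVC(probability=True).fit(model.X_train, model.y_train)
#   print clf.score(model.X_test, model.y_test)
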
Example #3
# this part is used if we want to fit an svm to find important variables
# commented out as it was decided that logistic regression gave a better result
# lsvc = LinearSVC(C=0.01, penalty="l1", dual=False).fit(Xx, yy)
# model = SelectFromModel(lsvc, prefit=True)
# feat_list_3 = Xx.iloc[:,list(model.get_support(indices=True))].columns

# this part is used if we want to fit logistic regression to find important variables
lr = LogisticRegression(C=1e-9,
                        penalty='l2',
                        dual=False,
                        solver='lbfgs').fit(Xx, yy)
model = SelectFromModel(lr, prefit=True)

# take the column names that SelectFromModel marks as important based on the logistic-regression coefficients
feat_list_3 = Xx.iloc[:, list(model.get_support(indices=True))].columns

# this is a list of all the common variables across all three variable selection methods
# these can likely be considered quite important as all three methods have picked them up
# list(set(feat_list_1) & set(feat_list_2) & set(feat_list_3))

# this 'master' list combines all the elements from the three variable selection methods
# overall it reduces our dimensionality from 79 down to 33, which generally makes for a more robust model
important_cols = list(
    set(list(feat_list_1) + list(feat_list_2) + list(feat_list_3)))

# as we have reduced the number of features used for training/testing,
# we must also restrict the final test set to the same columns
X_test_final = df_test.loc[:, important_cols].values

# this is the main split; taking all the important columns from our