import logging
from operator import itemgetter
from pprint import pformat

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

# Project-local helpers assumed to be in scope (defined elsewhere in the
# repo): stratified_split, print_pipe_model, get_pipe_model,
# get_pipe_transform, evaluate.


def train(self):
    train, test = stratified_split(self.__data)
    X_train = train.iloc[:, 0].values
    y_train = train.iloc[:, 1].values
    X_test = test.iloc[:, 0].values
    y_test = test.iloc[:, 1].values

    # model selection via cross-validation
    self.clf_ = self.__cross_valid(X_train, y_train)
    logging.info('best clf:\n%s' % print_pipe_model(self.clf_))

    # predict
    # cross_val_predict(clf, X_train, y_train, cv=5)
    y_train_pred = self.clf_.predict(X_train)
    y_test_pred = self.clf_.predict(X_test)
    logging.info('data shape: %s' % X_test.shape)

    # confusion matrix
    self.confusion_matrix_ = confusion_matrix(y_test, y_test_pred)
    logging.info('confusion matrix:\n%s' % pformat(self.confusion_matrix_))

    # wrongly predicted samples
    self.false_neg_ = test[(y_test == 1) & (y_test_pred == 0)]
    self.false_pos_ = test[(y_test == 0) & (y_test_pred == 1)]

    # evaluate
    self.train_score_ = evaluate(y_train, y_train_pred)
    self.test_score_ = evaluate(y_test, y_test_pred)

    # most influential features; when the pipeline has no 'std' scaling step,
    # pass X_train so the influences can be standardized against the data
    need_std = get_pipe_model(self.clf_, 'std') is None
    self.influence_ = (self._get_influence_ft(self.clf_, X_train)
                       if need_std else self._get_influence_ft(self.clf_))
    logging.info('Influence:\n%s\n...\n%s' %
                 (self.influence_.head(), self.influence_.tail()))
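# Illustrative sketch (not part of the original code): train() assumes that
# self.__cross_valid returns a fitted sklearn Pipeline whose named steps
# include a 'tfidf' vectorizer, an optional 'std' scaler, and a final
# estimator such as 'logre'. The helper name below and the exact step layout
# are assumptions for illustration only.
def build_example_pipe():
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    return Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('logre', LogisticRegression()),
    ])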
def _get_influence_ft(self, model, X=None):
    tfidf = get_pipe_model(model, 'tfidf')
    features = tfidf.get_feature_names()
    influences = self._get_influences(model)

    # guard: the influence vector does not line up with the vocabulary, so
    # return zero influence for every feature
    if len(features) != len(influences):
        return pd.DataFrame({'feature': features,
                             'influence': np.zeros(len(features))},
                            columns=['feature', 'influence'])

    # no 'std' step in the pipeline: standardize influences against the data
    if X is not None:
        influences = self._calc_std_influences(influences, tfidf.transform(X))

    df = pd.DataFrame({'feature': features, 'influence': influences},
                      columns=['feature', 'influence'])
    df.sort_values(by='influence', ascending=False, inplace=True)
    return df
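# Illustrative sketch (assumption): _calc_std_influences, referenced above,
# presumably rescales each raw coefficient by its feature's standard
# deviation in the transformed training data, the usual way to make linear
# coefficients comparable without a scaling step. The name below is
# hypothetical; the real helper lives elsewhere in the class.
def _calc_std_influences_sketch(self, influences, X_sparse):
    # densify the tf-idf matrix and take per-column standard deviations
    X = np.asarray(X_sparse.todense())
    return influences * X.std(axis=0)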
def get_decision_features(clf, x):
    tfidf = get_pipe_model(clf, 'tfidf')
    logre = get_pipe_model(clf, 'logre')

    # per-feature weights: transformed feature value times its coefficient
    x_trans = get_pipe_transform(clf, x).toarray().reshape(-1)
    coef = logre.coef_.reshape(-1)
    weights = [(i, v * w)
               for i, v, w in zip(range(len(x_trans)), x_trans, coef)
               if v and w]
    weights = sorted(weights, key=itemgetter(1), reverse=True)

    # trim non-contributing weights: keep only those needed to push the
    # decision value above zero (n_keep also covers an empty weight list,
    # where the loop variable would otherwise be unbound)
    n_keep = len(weights)
    y = logre.intercept_[0]
    for i, (_, w) in enumerate(weights):
        if y > 0:  # the sample has become positive
            n_keep = i
            break
        y += w
    weights = weights[:n_keep]

    # contributing features and their weight ratios
    features = tfidf.get_feature_names()
    w_sum = sum(w for _, w in weights)
    decision_fts = [(w / w_sum, features[i]) for i, w in weights]
    return decision_fts
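# Usage sketch for get_decision_features (pipeline and sample text are
# hypothetical):
#
#     fts = get_decision_features(trained_pipe, 'free money, click now')
#     for ratio, feature in fts:
#         print('%5.1f%%  %s' % (ratio * 100, feature))
#
# Each tuple pairs a contributing feature with its share of the total
# positive weight that pushed the decision value above zero.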
def _get_influences(self, model):
    # variant for estimators exposing feature_importances_ (e.g. trees,
    # random forests, gradient boosting)
    return get_pipe_model(model, -1).feature_importances_
def _get_influences(self, model):
    # variant for linear estimators: coefficients of the binary classifier
    return get_pipe_model(model, -1).coef_[0]
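# Illustrative note: the two _get_influences variants above presumably live
# in different wrapper classes, one per estimator family. A single generic
# version (hypothetical name) could dispatch on the final pipeline step:
def _get_influences_generic(self, model):
    est = get_pipe_model(model, -1)
    if hasattr(est, 'feature_importances_'):
        return est.feature_importances_
    return est.coef_[0]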