def tune_parameters(self, file):
    """Grid-search LogisticRegression hyper-parameters on the data in *file*.

    Optimizes for precision on the positive class (pos_label=1), then prints
    the best parameter set and a classification report on a held-out half of
    the data.
    """
    data = dr(file).read_data(precision=self.precision)
    # Columns 2..-2 are features; the last column is the integer label.
    features = data[:, 2:-1]
    labels = data[:, -1].astype(int)

    # 50/50 train/test split with a fixed seed for reproducibility.
    train_f, test_f, train_l, test_l = train_test_split(
        features, labels, test_size=0.5, random_state=0)

    search_space = [{
        'C': [1, 5, 10, 50, 100, 500, 1000],
        'tol': [1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
        'class_weight': ['balanced', None],
    }]
    searcher = GridSearchCV(
        LogisticRegression(), search_space, cv=2,
        scoring=make_scorer(precision_score, pos_label=1))
    searcher.fit(train_f, train_l)

    print("-----\nBest parameters set found for Logistic Regression tuning precision:\n-----")
    print(searcher.best_params_)
    print(classification_report(test_l, searcher.predict(test_f)))
def validate_proba(self, file, proba_tol):
    """Run 2-fold cross-validation using probability-thresholded predictions.

    For each fold, trains via self.train, predicts via self.proba_util with
    the given probability tolerance, and collects the precision score on the
    positive class.  Returns the list of per-fold precision scores.
    """
    data = dr(file).read_data(precision=self.precision)
    features = data[:, 2:-1]
    labels = data[:, -1].astype(int)

    fold_scores = []
    for train_idx, test_idx in KFold(n_splits=2).split(features):
        self.train(features[train_idx], labels[train_idx])
        fold_preds = self.proba_util(features[test_idx], proba_tol)
        fold_scores.append(
            precision_score(labels[test_idx], fold_preds, pos_label=1))
        # Print classification report (gated on self.debug inside the helper).
        dr.print_report(self.debug, fold_preds, labels[test_idx])
    return fold_scores
def predict(self, file):
    """Train and evaluate the AdaBoost model (self.ada) with 2-fold CV.

    Reads the data file, then for each fold trains via self.train and prints
    how many links the model retrieved and how many predictions matched the
    held-out labels.  Returns None; output is informational prints only.
    """
    data = dr(file).read_data(precision=self.precision)
    # NOTE(review): this slices columns 0:-1, while the sibling methods use
    # 2:-1 — confirm whether the first two columns are meant to be features here.
    features = data[:, 0:-1]
    labels = data[:, -1].astype(int)
    print('[INFO] ', np.count_nonzero(labels != 0), 'links present in training data, out of ', len(labels))

    k_fold = KFold(n_splits=2)
    for train, test in k_fold.split(features):
        self.train(features[train], labels[train])
        preds = self.ada.predict(features[test])
        print('[INFO] ', np.count_nonzero(preds != 0), ' links retrieved by model')
        # Fix: corrected the typo "Succesfully" -> "Successfully" in the log message.
        print('[INFO] Successfully predicted ', np.count_nonzero(preds == labels[test]),
              ' links out of ', len(labels[test]))
from utils.data_reader import DataReader as dr

# Build a boosted ensemble and compute the feature importances.
#
# Fix: the original code constructed three estimators in a row, rebinding
# `forest` each time — a RandomForestClassifier and a 4000-estimator
# AdaBoostClassifier were created but never fitted (dead code).  Only the
# final 50-estimator AdaBoostClassifier was ever used, so the dead
# constructions are removed; behavior is unchanged.
forest = AdaBoostClassifier(n_estimators=50, learning_rate=1, algorithm='SAMME.R')

data = dr('corpus_scores\\v2_5_raw_inv.txt').read_data(precision=-1)
# Columns 2..-2 are features; the last column is the integer label.
features = data[:, 2:-1]
labels = data[:, -1].astype(int)
forest.fit(features, labels)

importances = forest.feature_importances_
# Spread of each feature's importance across the ensemble's individual trees.
std = np.std([tree.feature_importances_ for tree in forest.estimators_], axis=0)
# Feature indices sorted by importance, most important first.
indices = np.argsort(importances)[::-1]

# Print the feature ranking
print("Feature ranking:")
for f in range(features.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))