Example #1
0
	def tune_parameters(self, file):
		"""Grid-search LogisticRegression hyper-parameters on the data in *file*,
		scoring by precision on the positive class, and print the best parameter
		set plus a classification report on the held-out half.
		"""
		dataset = dr(file).read_data(precision = self.precision)
		X = dataset[:, 2:-1]
		y = dataset[:, -1].astype(int)

		# Half the rows train the search, the other half evaluate the winner.
		X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.5, random_state = 0)
		param_grid = [{
			'C': [1, 5, 10, 50, 100, 500, 1000],
			'tol': [1e-2, 1e-3, 1e-4, 1e-5, 1e-6],
			'class_weight': ['balanced', None],
		}]

		scorer = make_scorer(precision_score, pos_label = 1)
		search = GridSearchCV(LogisticRegression(), param_grid, cv = 2, scoring = scorer)
		search.fit(X_train, y_train)
		print("-----\nBest parameters set found for Logistic Regression tuning precision:\n-----")
		print(search.best_params_)
		print(classification_report(y_test, search.predict(X_test)))
Example #2
0
    def validate_proba(self, file, proba_tol):
        """Cross-validate the probability-thresholded classifier on *file*.

        Runs 2-fold cross-validation: each fold trains via self.train and
        predicts via self.proba_util with the given probability tolerance.
        Returns the list of per-fold precision scores for the positive class.
        """
        dataset = dr(file).read_data(precision=self.precision)
        X = dataset[:, 2:-1]
        y = dataset[:, -1].astype(int)

        scores = []
        for train_idx, test_idx in KFold(n_splits=2).split(X):
            self.train(X[train_idx], y[train_idx])
            predictions = self.proba_util(X[test_idx], proba_tol)
            scores.append(precision_score(y[test_idx], predictions, pos_label=1))
            # Print Classification Report (gated on self.debug inside print_report)
            dr.print_report(self.debug, predictions, y[test_idx])
        return scores
Example #3
0
    def predict(self, file):
        """Train and evaluate self.ada on *file* with 2-fold cross-validation,
        printing link counts: how many links exist, how many the model
        retrieves, and how many predictions match the labels per fold.

        NOTE(review): unlike the other readers here this keeps columns 0:-1
        as features — presumably intentional; confirm against the data layout.
        """
        dataset = dr(file).read_data(precision=self.precision)
        X = dataset[:, 0:-1]
        y = dataset[:, -1].astype(int)

        print('[INFO] ', np.count_nonzero(y != 0),
              'links present in training data, out of ', len(y))
        for train_idx, test_idx in KFold(n_splits=2).split(X):
            self.train(X[train_idx], y[train_idx])
            predictions = self.ada.predict(X[test_idx])
            print('[INFO] ', np.count_nonzero(predictions != 0),
                  ' links retrieved by model')
            print('[INFO] Succesfully predicted ',
                  np.count_nonzero(predictions == y[test_idx]), ' links out of ',
                  len(y[test_idx]))
Example #4
0
from utils.data_reader import DataReader as dr

# Build a classifier and print its feature-importance ranking.
# FIX(review): the original constructed three estimators in a row
# (a RandomForestClassifier and a 4000-estimator AdaBoostClassifier),
# each immediately overwritten — only the final 50-estimator AdaBoost
# was ever fitted. The dead constructions are removed; behavior is
# unchanged. The per-tree std (np.std over forest.estimators_) was
# computed but never used, so it is removed as well.
forest = AdaBoostClassifier(n_estimators=50,
                            learning_rate=1,
                            algorithm='SAMME.R')

data = dr('corpus_scores\\v2_5_raw_inv.txt').read_data(precision=-1)
features = data[:, 2:-1]            # columns 2..n-2 are features
labels = data[:, -1].astype(int)    # last column is the label

forest.fit(features, labels)
importances = forest.feature_importances_
indices = np.argsort(importances)[::-1]  # feature indices, most important first

# Print the feature ranking
print("Feature ranking:")

for f in range(features.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))