    def __init__(self):
        self.dataSets = os.listdir('data/')
        self.categories = [
            "geen_event", "sport", "entertainment", "bijeenkomst", "incident",
            "anders"
        ]
        # self.categories = ["wel_event", "geen_event", "sport","entertainment", "bijeenkomst", "incident", "anders"] # uncomment for wel_event
        self.classifierAFeatures = ['wordFeatures']
        self.classifierBFeatures = [
            'category', 'location', 'wordOverlapSimple', 'wordOverlapUser'
        ]
        self.annotation = {}
        self.candidates = {}
        self.result = defaultdict(self.resultDictionary)
        self.cm = []
        self.informativeFeatures = []
        self.accuracy = []
        self.choice = 0

        # real test or dev test?
        self.realTest = False
        if len(sys.argv) == 2:
            if sys.argv[1] == "-test":
                self.realTest = True

        if self.realTest:
            print("\nThe system is running in TEST mode.\n")
            self.ITERATIONS = 1
        else:
            print("\nThe system is running in DEVTEST mode.\n")
            self.ITERATIONS = 10

        self.__loadDataSet()
        self.featureSelector = FeatureSelector(self.candidates)
        self._trainClassifiers()
        if self.realTest:
            self._saveClassifiers()
def training_and_classification_with_kfold_cross_validation(collection_name, k):
    '''
    Training and classification of an autotagger using k-fold cross validation
    '''
    _split_metadata_and_features(collection_name, k)
    for i in range(1,k+1):
        # Create a gaia dataset with the training set
        print "----------------------- DATASET CREATION (FOLD %d)-----------------------" % i
        training_features='train/%s_features__fold%d.tsv' % (collection_name, i)
        chunk_size=5000
        dataset_suffix="fold%d" % i
        replace_dataset=True
        dataset_creator = DatasetCreator(collection_name)
        dataset_creator.create(training_features, chunk_size, dataset_suffix, replace_dataset)
            
        # Feature selection over the gaia dataset
        print "----------------------- FEATURE SELECTION (FOLD %d)-----------------------" % i
        dataset='dbs/%s__fold%d.db' % (collection_name, i)
        pca_covered_variance=75
        include_highlevel=True
        feature_selector = FeatureSelector()
        feature_selector.select(dataset, pca_covered_variance, include_highlevel)
        
        # Autotag a given test set
        print "----------------------- AUTOTAGGING (FOLD %d)-----------------------" % i
        dataset='transformed_dbs/%s__fold%d.db' % (collection_name, i)
        training_metadata='train/%s_metadata__fold%d.tsv' % (collection_name, i)
        test_features='test/%s_features__fold%d.tsv' % (collection_name, i)
        output_binary='test/%s_output_binary__fold%d.tsv' % (collection_name, i)
        output_affinity='test/%s_output_affinity__fold%d.tsv' % (collection_name, i)
        metric='LC'
        num_sim=18
        threshold=0.2
        autotagger = Autotagger()
        autotagger.train(dataset, training_metadata)
        autotagger.classify(test_features, output_binary, metric, num_sim, threshold, ranked=False)
        autotagger.classify(test_features, output_affinity, metric, num_sim, threshold, ranked=True)
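
The loop above relies on a _split_metadata_and_features helper that is not shown. Below is a minimal sketch of what such a k-fold splitter could look like; the source file names (COLLECTION_features.tsv, COLLECTION_metadata.tsv), the round-robin fold assignment, and the assumption that the two files are aligned line-by-line are all assumptions, not the repo's actual implementation. It writes exactly the fold paths the loop reads.

import os
import random

def _split_metadata_and_features(collection_name, k, seed=42):
    # hypothetical sketch: read the full tables (paths are assumptions)
    with open('%s_features.tsv' % collection_name) as f:
        feature_rows = f.readlines()
    with open('%s_metadata.tsv' % collection_name) as f:
        metadata_rows = f.readlines()

    # shuffle row indices once so features and metadata stay aligned
    idx = list(range(len(feature_rows)))
    random.Random(seed).shuffle(idx)
    folds = [idx[i::k] for i in range(k)]

    for d in ('train', 'test'):
        if not os.path.isdir(d):
            os.makedirs(d)

    for i in range(1, k + 1):
        test_idx = set(folds[i - 1])
        train_idx = [j for j in idx if j not in test_idx]
        for name, rows in (('features', feature_rows),
                           ('metadata', metadata_rows)):
            with open('train/%s_%s__fold%d.tsv' % (collection_name, name, i), 'w') as f:
                f.writelines(rows[j] for j in train_idx)
            with open('test/%s_%s__fold%d.tsv' % (collection_name, name, i), 'w') as f:
                f.writelines(rows[j] for j in sorted(test_idx))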
    def __init__(self):
        self.dataSets = os.listdir('data/')
        self.candidates = {}
        self._loadDataSet()
        featureSelector = FeatureSelector(self.candidates)
        #self.featuresCat = []
        #self.featuresBi = []
        self.events = []
        
        # detect events
        for h in self.candidates:
            for t in self.candidates[h]:
                candidate = self.candidates[h][t] 
                
                featuresCat = featureSelector.getFeatures(candidate, ['wordFeatures'])
                featureSelector.addCategoryClassifier(self.classifierCat)
                label = self.classifierCat.classify(featuresCat)

                featuresBi = featureSelector.getFeatures(candidate,['category', 'location','wordOverlapSimple','wordOverlapUser'])
                classifierBiLabel = self.classifierBi.classify(featuresBi)
                if classifierBiLabel != "geen_event":
                    self.events.append((candidate,classifierBiLabel))                          
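
This snippet uses self.classifierCat and self.classifierBi without showing where they come from. A plausible sketch of loading them, assuming the pickle files written by ClassifierCreator._saveClassifiers further down this page (the helper name is hypothetical):

import pickle

def _loadClassifiers(self, datasetName):
    # hypothetical helper: the file names match what _saveClassifiers writes
    with open("data/" + datasetName + "/categoryClassifier.bin", "rb") as f:
        self.classifierCat = pickle.load(f)
    with open("data/" + datasetName + "/eventClassifier.bin", "rb") as f:
        self.classifierBi = pickle.load(f)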
Example #5
    Y_train = pd.factorize(train[labelName])[0]
    X_train_origin = train.iloc[:, 0:train.columns.size - 1].copy()
    Y_test = pd.factorize(test[labelName])[0]
    X_test_origin = test.iloc[:, 0:test.columns.size - 1].copy()

    scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))

    scaler.fit(X_train_origin)
    #scaling of training data
    X_train_origin = pd.DataFrame(scaler.transform(X_train_origin.copy()), columns=X_train_origin.columns)
    # apply same transformation to test data
    X_test_origin = pd.DataFrame(scaler.transform(X_test_origin.copy()), columns=X_test_origin.columns)

    trainTmp = X_train_origin.copy()
    trainTmp[labelName] = Y_train
    fs = FeatureSelector(trainTmp)

    featureSize = data.columns.size
    threshold = 10

    clfNames = ["lbfgs", "adam", "sgd", "randomForest", "decisionTree", "rbf", "poly", "linear", "knn"]

    while(featureSize >= threshold):
        features = fs.featureSelectionSelectKBestClassification(featureSize,labelName)
        print(features)
        clfs = [MLPClassifier(solver='lbfgs', alpha=10.0, hidden_layer_sizes=(150,), random_state=1, activation="tanh", max_iter=500),
                MLPClassifier(solver='adam', alpha=10.0, hidden_layer_sizes=(150,), random_state=1, activation="tanh", max_iter=500),
                MLPClassifier(solver='sgd', alpha=10.0, hidden_layer_sizes=(150,), random_state=1, activation="tanh", max_iter=500),
                RandomForestClassifier(n_estimators = 100, criterion = 'entropy', random_state = 42),tree.DecisionTreeClassifier(),
                svm.SVC(kernel='rbf', C=10.0, gamma=0.1, probability=True),
                svm.SVC(kernel='poly', C=10.0, degree=3, probability=True),
Example #6
    for categorised in categorised_attributes:
        if categorised in df:
            df[categorised] = df[categorised].astype('category').cat.codes

    df.fillna(0, inplace=True)
    df = df.drop(columns=[x for x in ignored_attributes if x in df])

    return df


print("Loading training data...")
train_data_frame = pandas.read_csv('kaggle_data/train.csv')
train_data_frame = replace_dummies(train_data_frame)

print("Selecting top features for use...")
selector = FeatureSelector(train_data_frame, 'SalePrice')
top_features = selector.rank_features(70)

top_named_features = [train_data_frame.columns[x] for x in top_features]
top_named_features.append('SalePrice')

print("Top features: ")
print(top_named_features)

print("Reloading train data with only top features")
train_data_frame = pandas.read_csv('kaggle_data/train.csv',
                                   usecols=top_named_features)
train_data_frame = replace_dummies(train_data_frame)
train_data_frame = train_data_frame.drop(axis=1,
                                         columns=[
                                             x
Example #7
    #preparing test and training for final evaluation: using copies not to create problems

    scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))

    #don't cheat: fit only on training data
    scaler.fit(train)

    trainTmp = pd.DataFrame(scaler.transform(train.copy()), columns=train.columns)
    # apply same transformation to test data
    testTmp = pd.DataFrame(scaler.transform(test.copy()), columns=test.columns)

    fsSize = train.columns.size
    threshold = 10

    fs = FeatureSelector(trainTmp.copy())

    clfNames = ["lbfgs", "adam", "sgd", "randomForest", "decisionTree", "linear", "poly", "rbf", "Knn"]

    while(fsSize >= threshold):
        features = fs.featureSelectionSelectKBestRegression(fsSize, labelName)
        print("FEATURES NEL WHILE ", features)
        #C=1e3
        svr_rbf = SVR(kernel='rbf', C=1)
        svr_lin = SVR(kernel='linear', C=1)
        svr_poly = SVR(kernel='poly', C=1)

        clfs = [MLPRegressor(solver='lbfgs', alpha = 10.0, hidden_layer_sizes=(10,), activation="tanh",epsilon=1e-4),
                MLPRegressor(solver='adam', alpha=10.0, hidden_layer_sizes=(10,), activation="tanh", epsilon=1e-4),
                MLPRegressor(solver='sgd', alpha=10.0, hidden_layer_sizes=(10,), activation="tanh", epsilon=1e-4),
                RandomForestRegressor(n_jobs=10, random_state=45, n_estimators=10),  DecisionTreeRegressor(), svr_lin, svr_poly, svr_rbf,
class ClassifierMethods(object):
    def __init__(self, input_, labels, train_percent, test_percent):
        self.input_ = input_
        self.labels = labels
        self.define_indices(train_percent, test_percent)
        self.featSel = FeatureSelector()
        self.logreg_clf = LogisticRegression()

    def define_indices(self, train_percent, test_percent):

        n_rows = 4500  #input_[i].shape[0]
        train_end = int(n_rows * train_percent)
        test_end = int(train_end + n_rows * test_percent)
        self.train_indices = (0, train_end)
        self.test_indices = (train_end, test_end)
        self.validation_indices = (test_end, -1)

    def plot_roc_curve(self):
        pass

    def log_reg(self, X_train, y_train, X_test):
        self.logreg_clf.fit(X_train, y_train)
        return self.logreg_clf.predict_proba(X_test), self.logreg_clf.predict(X_test)

    def rank_features(self):
        for i in range(9):
            X_train = self.input_[i][0:self.train_indices[1], :]
            y_train = (self.labels[0:self.train_indices[1]]).reshape(2700, )
            print('--------------------------------------------')
            print('Select k-best')
            print(self.featSel.kBest_score(X_train, y_train))
            print('Select extratrees')
            print(self.featSel.extree_score(X_train, y_train))
            print('----------------------------------------------')

    def class_rdnforest(self,
                        X_train_score,
                        X_input_score,
                        max_depth,
                        min_samples_split,
                        random_state,
                        X_train,
                        X_input,
                        y_train,
                        y_input,
                        time_ms,
                        figname='default_rdnfor.png',
                        last=False):
        rdm_for = RandomForestClassifier(max_depth=max_depth,
                                         random_state=random_state,
                                         min_samples_split=min_samples_split)
        start = time.process_time()
        rdm_for.fit(X_train, y_train)
        end = time.process_time()
        time_ms = (end - start) + time_ms

        if not last:
            X_train_score = X_train_score + (np.array(
                rdm_for.predict_proba(X_train)))
            X_input_score = X_input_score + (np.array(
                rdm_for.predict_proba(X_input)))
            return X_train_score, X_input_score, time_ms
        else:
            y_pred = np.array(rdm_for.predict(X_input))
            y_input = y_input.reshape(900, )
            classes = np.unique(y_input)
            plot_confussion_matrix(y_input,
                                   y_pred,
                                   classes,
                                   plot_name=figname,
                                   cmap=plt.cm.Blues,
                                   show=False)
            return time_ms

    def class_ada(self,
                  X_train,
                  y_train,
                  X_input,
                  y_input,
                  X_train_score,
                  X_input_score,
                  time_ms,
                  figname='default_ada.png',
                  last=False):
        ada_clf = AdaBoostClassifier(random_state=2, learning_rate=0.1)
        start = time.process_time()
        ada_clf.fit(X_train, y_train)
        end = time.process_time()
        time_ms = (end - start) + time_ms
        if not last:
            X_train_score = X_train_score + (np.array(
                ada_clf.predict_proba(X_train)))
            X_input_score = X_input_score + (np.array(
                ada_clf.predict_proba(X_input)))
            return X_train_score, X_input_score, time_ms

        else:
            y_pred = np.array(ada_clf.predict(X_input))
            y_input = y_input.reshape(900, )
            classes = np.unique(y_input)
            plot_confussion_matrix(y_input,
                                   y_pred,
                                   classes,
                                   plot_name=figname,
                                   cmap=plt.cm.Blues,
                                   show=False)
            return time_ms

    def train_and_class(self, test=False):

        rdn_for = np.zeros((900, 9))
        ada_score = np.zeros((900, 9))
        rdn_train = np.zeros((2700, 9))
        ada_train_score = np.zeros((2700, 9))
        time_rdfor = 0
        time_ada = 0

        for i in range(9):
            X_train = self.input_[i][0:self.train_indices[1], :]
            y_train = (self.labels[0:self.train_indices[1]]).reshape(2700, )

            if not test:
                X_input = self.input_[i][self.validation_indices[0]:, :]
                y_input = (self.labels[self.validation_indices[0]:]).reshape(
                    900, )
            else:
                X_input = self.input_[i][
                    self.test_indices[0]:self.test_indices[1], :]
                y_input = (self.labels[self.test_indices[0]:self.
                                       test_indices[1]]).reshape(900, )
            rdn_train, rdn_for, time_rdfor = self.class_rdnforest(
                time_ms=time_rdfor,
                X_input_score=rdn_for,
                X_train_score=rdn_train,
                max_depth=4,
                min_samples_split=2,
                random_state=2,
                X_train=X_train,
                X_input=X_input,
                y_train=y_train,
                y_input=y_input)
            ada_train_score, ada_score, time_ada = self.class_ada(
                time_ms=time_ada,
                X_train=X_train,
                X_input=X_input,
                X_train_score=ada_train_score,
                X_input_score=ada_score,
                y_input=y_input,
                y_train=y_train)

        time_rdfor = self.class_rdnforest(time_ms=time_rdfor,
                                          X_train_score=None,
                                          X_input_score=None,
                                          max_depth=4,
                                          min_samples_split=2,
                                          random_state=2,
                                          X_train=rdn_train,
                                          X_input=rdn_for,
                                          y_train=y_train,
                                          y_input=y_input,
                                          figname='rdfor_class.png',
                                          last=True)
        time_ada = self.class_ada(time_ms=time_ada,
                                  X_train_score=None,
                                  X_input_score=None,
                                  # stack on the accumulated AdaBoost scores,
                                  # mirroring the Random Forest call above
                                  X_train=ada_train_score,
                                  X_input=ada_score,
                                  y_train=y_train,
                                  y_input=y_input,
                                  figname='ada_class.png',
                                  last=True)
        print('Training time for Random Forest: %f' % time_rdfor)
        print('Training time for AdaBoost: %f' % time_ada)

    def train_and_class_selfeat(self, kbest=True, test=False):

        rdn_for = np.zeros((900, 9))
        ada_score = np.zeros((900, 9))
        rdn_train = np.zeros((2700, 9))
        ada_train_score = np.zeros((2700, 9))
        time_rdfor = 0
        time_ada = 0

        for i in range(9):
            X_train = self.input_[i][0:self.train_indices[1], :]
            y_train = (self.labels[0:self.train_indices[1]]).reshape(2700, )
            if not test:
                X_input = self.input_[i][self.validation_indices[0]:, :]
                y_input = (self.labels[self.validation_indices[0]:]).reshape(
                    900, )
            else:
                X_input = self.input_[i][
                    self.test_indices[0]:self.test_indices[1], :]
                y_input = (self.labels[self.test_indices[0]:self.
                                       test_indices[1]]).reshape(900, )

            if kbest:
                print('Selecting k-best features...')
                X_train, X_input = self.featSel.kBest_fit(X_train=X_train,
                                                          X_input=X_input)
            else:
                print('Selecting features via extra trees...')
                X_train, X_input = self.featSel.extree_fit(X_train=X_train,
                                                           X_input=X_input)

            rdn_train, rdn_for, time_rdfor = self.class_rdnforest(
                time_ms=time_rdfor,
                X_input_score=rdn_for,
                X_train_score=rdn_train,
                max_depth=4,
                min_samples_split=2,
                random_state=2,
                X_train=X_train,
                X_input=X_input,
                y_train=y_train,
                y_input=y_input)
            ada_train_score, ada_score, time_ada = self.class_ada(
                time_ms=time_ada,
                X_train=X_train,
                X_input=X_input,
                X_train_score=ada_train_score,
                X_input_score=ada_score,
                y_input=y_input,
                y_train=y_train)

        time_rdfor = self.class_rdnforest(
            time_ms=time_rdfor,
            X_train_score=None,
            X_input_score=None,
            max_depth=4,
            min_samples_split=2,
            random_state=2,
            X_train=rdn_train,
            X_input=rdn_for,
            y_train=y_train,
            y_input=y_input,
            figname='kbestfeat_rdfor.png' if kbest else 'extree_rdfor.png',
            last=True)
        time_ada = self.class_ada(
            time_ms=time_ada,
            X_train_score=None,
            X_input_score=None,
            # stack on the accumulated AdaBoost scores, mirroring the
            # Random Forest call above
            X_train=ada_train_score,
            X_input=ada_score,
            y_train=y_train,
            y_input=y_input,
            figname='kbestfeat_ada.png' if kbest else 'extree_ada.png',
            last=True)
        print('Training time for Random Forest: %f' % time_rdfor)
        print('Training time for AdaBoost: %f' % time_ada)
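
Both classifier methods call a plot_confussion_matrix helper (spelled as in the call sites) that is imported from elsewhere in the repo. A minimal stand-in built on sklearn and matplotlib, under the assumption that it only renders and saves the confusion matrix:

import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

def plot_confussion_matrix(y_true, y_pred, classes, plot_name='cm.png',
                           cmap=None, show=False):
    # compute the confusion matrix and save it as an image
    cm = confusion_matrix(y_true, y_pred, labels=classes)
    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.set_xticks(range(len(classes)))
    ax.set_xticklabels(classes, rotation=45)
    ax.set_yticks(range(len(classes)))
    ax.set_yticklabels(classes)
    ax.set_xlabel('Predicted label')
    ax.set_ylabel('True label')
    fig.colorbar(im)
    fig.tight_layout()
    fig.savefig(plot_name)
    if show:
        plt.show()
    plt.close(fig)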
Example #10
# You should have received a copy of the GNU General Public License
# along with music-autotagging-msordo.  If not, see <http://www.gnu.org/licenses/>.

# Written by Mohamed Sordo (@neomoha)
# Email: mohamed ^dot^ sordo ^at^ gmail ^dot^ com
# Website: http://msordo.weebly.com

import os, sys, argparse

from FeatureSelector import FeatureSelector

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Feature selection over the Gaia dataset')
    parser.add_argument('collection_name', help='Name of the collection')
    parser.add_argument('--dataset', default=None, help='Path to the gaia dataset (default="dbs/COLLECTIONNAME.db")')
    parser.add_argument('--pca-covered-variance', type=int, default=75, help='The PCA transformation should keep at least this percentage of variance (default=75)')
    parser.add_argument('--exclude-highlevel', help='exclude high level descriptors', action="store_true")
    args = parser.parse_args()
    
    if args.dataset is None:
        args.dataset = "dbs/"+args.collection_name+".db"
    
    if not os.path.exists(args.dataset):
        print "Dataset '%s' not found" % args.dataset
        sys.exit(-1)
    
    print args

    feature_selector = FeatureSelector()
    feature_selector.select(args.dataset, args.pca_covered_variance, not args.exclude_highlevel)
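
Invocation would look something like python SelectFeatures.py my_collection --pca-covered-variance 80 --exclude-highlevel; the flags match the argparse definitions above, but the script filename is an assumption.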
import lightgbm as lgb
import xgboost as xgb
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score

from FeatureSelector import FeatureSelector
from preproccess import read_data, prepare_and_scale_data

train, test = read_data()
train, y_train, test, id_test = prepare_and_scale_data(train, test)

feature_selector = FeatureSelector(train,test)
predictors = feature_selector.feature_selection_based_on_genetic_algo(train,test,y_train)

train = train[predictors]


en = LinearRegression(fit_intercept=True, n_jobs=-1)

rf = RandomForestRegressor(n_estimators=100, n_jobs=2, max_depth=6,)

et = ExtraTreesRegressor(n_estimators=100, n_jobs=4, max_depth=6,)

xgbm = xgb.sklearn.XGBRegressor(max_depth=6, learning_rate=0.05,
                                n_estimators=1000, base_score=y_train.mean())

lgbm = lgb.LGBMRegressor(nthread=3,silent=True,learning_rate=0.05,max_depth=7,n_estimators=1000)
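
The snippet imports cross_val_score but is cut off before the evaluation step. A plausible continuation, assuming 5-fold CV with negative-MSE scoring (both assumptions):

for name, model in [('linreg', en), ('rf', rf), ('extratrees', et),
                    ('xgb', xgbm), ('lgbm', lgbm)]:
    scores = cross_val_score(model, train, y_train, cv=5,
                             scoring='neg_mean_squared_error')
    print('%s: %.4f' % (name, scores.mean()))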

Example #12
    sample_df.columns
    sample_df.to_csv(submission_file, index=False)
    print 'Done'
    '''

    #FeatureSelector().select_features(write=True)
    #a = pd.Series(  DataProcessor().get_all_commands_series())
    #print a
    #commands = pd.Series(DataProcessor().get_all_commands_series())
    #print commands.keys()

    sample_df = pd.read_csv(sample_submission_file)
    result_df = pd.read_csv('outputs/FeatureSelector/all_500_500.csv')
    cols = select_k_best(result_df, 200)
    result_df = result_df[cols]
    result_df.loc[:, 'Label'] = FeatureSelector().get_labels_array_all()
    result_df.to_csv('outputs/FeatureSelector/selected_all.csv')
    v = pd.read_csv(validation_file)
    validation_set = v['Label']
    classification_res = []
    clf = LOF(n_neighbors=20, contamination=0.1)

    #for num in range(0, 40):
    #   print "******* User {} ********".format(num)
    #   ClassificationModel(user_num=num, df=result_df).optimize_parameters()

    for num in range(0, 10):
        print "******* User {} ********".format(num)
        classification_res.extend(
            ClassificationModel(user_num=num, df=result_df,
                                model=clf).predictLabels())
class ClassifierCreator:
    def __init__(self):
        self.dataSets = os.listdir('data/')
        self.categories = [
            "geen_event", "sport", "entertainment", "bijeenkomst", "incident",
            "anders"
        ]
        # self.categories = ["wel_event", "geen_event", "sport","entertainment", "bijeenkomst", "incident", "anders"] # uncomment for wel_event
        self.classifierAFeatures = ['wordFeatures']
        self.classifierBFeatures = [
            'category', 'location', 'wordOverlapSimple', 'wordOverlapUser'
        ]
        self.annotation = {}
        self.candidates = {}
        self.result = defaultdict(self.resultDictionary)
        self.cm = []
        self.informativeFeatures = []
        self.accuracy = []
        self.choice = 0

        # real test or dev test?
        self.realTest = False
        if len(sys.argv) == 2:
            if sys.argv[1] == "-test":
                self.realTest = True

        if self.realTest:
            print("\nThe system is running in TEST mode.\n")
            self.ITERATIONS = 1
        else:
            print("\nThe system is running in DEVTEST mode.\n")
            self.ITERATIONS = 10

        self.__loadDataSet()
        self.featureSelector = FeatureSelector(self.candidates)
        self._trainClassifiers()
        if self.realTest:
            self._saveClassifiers()

    def __loadDataSet(self):
        for i, dataset in enumerate(self.dataSets):
            print("{}: {}".format(i, dataset))
        if self.realTest:
            self.choice = int(
                input("\nPlease select an annotated TRAIN dataset: "))
        else:
            self.choice = int(
                input("\nPlease select an annotated TRAIN/DEVTEST dataset: "))

        with open("data/" + self.dataSets[self.choice] +
                  "/sanitizedAnnotation.json") as jsonFile:
            self.annotation = json.load(jsonFile)

        with open("data/" + self.dataSets[self.choice] +
                  "/sanitizedEventCandidates.json") as jsonFile:
            self.candidates = json.load(jsonFile)

        if self.realTest:
            print()
            for i, dataset in enumerate(self.dataSets):
                print("{}: {}".format(i, dataset))
            choice = int(input("\nPlease select an annotated TEST dataset: "))

            with open("data/" + self.dataSets[choice] +
                      "/sanitizedEventCandidates.json") as jsonFile:
                self.testCandidates = json.load(jsonFile)

            #add to annotation file
            with open("data/" + self.dataSets[choice] +
                      "/sanitizedAnnotation.json") as jsonFile:
                self.testAnnotation = json.load(jsonFile)

    def _saveClassifiers(self):
        print("\nSaving the category and event classifier...")

        with open(
                "data/" + self.dataSets[self.choice] +
                "/categoryClassifier.bin", "wb") as f:
            pickle.dump(self.classifierA, f)

        with open(
                "data/" + self.dataSets[self.choice] + "/eventClassifier.bin",
                "wb") as f:
            pickle.dump(self.classifierB, f)

    def _selectDataset(self):
        dataset = []
        for h in self.candidates:
            for t in self.candidates[h]:
                dataset.append((self.candidates[h][t], self.eventType(h, t)))

        if self.realTest:
            # use all of the annotated train data to train
            self.trainData = dataset

            dataset = []
            for h in self.testCandidates:
                for t in self.testCandidates[h]:
                    dataset.append(
                        (self.testCandidates[h][t], self.eventType(h, t)))

            self.testData = dataset
        else:
            random.shuffle(dataset)
            # random dataset splits for cross validation
            trainSplit = int(0.8 * len(dataset))
            self.trainData = dataset[:trainSplit]
            self.testData = dataset[trainSplit:]

    def _trainClassifiers(self):
        print("\nClassifying events...\n")
        for i in range(self.ITERATIONS):
            if self.realTest:
                testMode = "TEST"
            else:
                testMode = "DEVTEST"
            print("###########")
            print("### {} {}".format(testMode, i + 1))
            print("#############")
            self._selectDataset()
            self.testA = []
            self.trainA = []
            self.testB = []
            self.trainB = []

            #first train category classifier
            print(
                "### TRAINING STEP 1: Training category classifier (Naive Bayes with word features) ###"
            )
            for candidate, label in self.testData:
                featuresA = self.featureSelector.getFeatures(
                    candidate, self.classifierAFeatures)
                self.testA.append((featuresA, label))

            for candidate, label in self.trainData:
                featuresA = self.featureSelector.getFeatures(
                    candidate, self.classifierAFeatures)
                self.trainA.append((featuresA, label))

            # MultinomialNB seems about as good here as the nltk naive bayes classifier, but is somewhat faster
            self.classifierA = SklearnClassifier(MultinomialNB()).train(
                self.trainA)
            # sends the category classifier to the featureSelector
            self.featureSelector.addCategoryClassifier(self.classifierA)

            print(
                "### TRAINING STEP 2: Training event/non-event classifier (Naive Bayes with category & other features) ###"
            )
            # second step train the event/no event classifier (a second category classifier)
            for candidate, label in self.testData:
                featuresB = self.featureSelector.getFeatures(
                    candidate, self.classifierBFeatures)
                self.featureKeys = featuresB.keys()
                self.testB.append((featuresB, label))

            for candidate, label in self.trainData:
                featuresB = self.featureSelector.getFeatures(
                    candidate, self.classifierBFeatures)
                self.featureKeys = featuresB.keys()
                self.trainB.append((featuresB, label))

            self.classifierB = nltk.NaiveBayesClassifier.train(self.trainB)

            self.calculateStats(i)

        self.printStats()

    def resultDictionary(self):
        return defaultdict(list)

    def calculateStats(self, i):
        '''Function to calculate all stats'''
        #calculate cm for this iteration
        ref = []
        tagged = []
        for f, e in self.testB:
            ref.append(self.classifierB.classify(f))
            tagged.append(e)

        self.cm.append(nltk.ConfusionMatrix(ref, tagged))
        #self.informativeFeatures.append(self.classifierB.most_informative_features(10))
        print()
        #calculate precision and recall for this iteration for each category
        refsets = defaultdict(set)
        testsets = defaultdict(set)

        #allCount = 0
        #noEventCount = 0
        for n, (feats, label) in enumerate(self.testB):
            #allCount += 1
            #if label == "geen_event":
            #    noEventCount += 1
            refsets[label].add(n)
            observed = self.classifierB.classify(feats)
            # uncomment for wel_event
            #if label != "geen_event":
            #    refsets["wel_event"].add(n)
            #if observed != "geen_event":
            #    testsets["wel_event"].add(n)
            testsets[observed].add(n)

        #print("Accuracy geen_event (baseline) is", noEventCount/allCount) #

        self.accuracy.append(
            nltk.classify.accuracy(self.classifierB, self.testB))

        # compute precision and recall for each category
        for category in self.categories:
            if category in testsets:
                self.result[category]["p"].append(
                    nltk.metrics.precision(refsets[category],
                                           testsets[category]))
                self.result[category]["r"].append(
                    nltk.metrics.recall(refsets[category], testsets[category]))
                self.result[category]["f"].append(
                    nltk.metrics.f_measure(refsets[category],
                                           testsets[category]))
            else:
                self.result[category]["p"].append(float(0))
                self.result[category]["r"].append(float(0))
                self.result[category]["f"].append(float(0))

    def eventType(self, geohash, timestamp):
        # return values {use strings?}
        eventTypes = {
            0: "geen_event",
            1: "sport",
            2: "entertainment",
            3: "bijeenkomst",
            4: "incident",
            5: "anders"
        }
        try:
            returnValue = eventTypes[self.annotation[geohash][timestamp]]
        except KeyError:
            returnValue = eventTypes[self.testAnnotation[geohash][timestamp]]

        return returnValue

    def printStats(self):
        print(", ".join(self.classifierBFeatures))
        it = self.ITERATIONS
        print("### EVALUATION STEP 1: Detailed statistics for the classifier:")
        for i in range(it):
            if self.realTest:
                testMode = "TEST"
            else:
                testMode = "DEVTEST"
            print("\n###########")
            print("### {} {}".format(testMode, i + 1))
            print("#############\n")
            print(self.cm[i])
            print("Most informative features")
        # print(self.informativeFeatures[i])
        print(
            "\n### EVALUATION STEP 2: Classification using features: {} | training set size: {} & test set size: {}\n"
            .format(", ".join(self.featureKeys), len(self.trainB),
                    len(self.testB)))
        headers = ['#', 'accuracy'] + self.categories

        prf = "P    R    F"
        table = [['', '', prf, prf, prf, prf, prf, prf]]
        for i in range(it):
            row = [i + 1, round(self.accuracy[i], 2)]
            for category in self.categories:
                value = "{:.2f} {:.2f} {:.2f}".format(
                    self.customRound(self.result[category]["p"][i], 2),
                    self.customRound(self.result[category]["r"][i], 2),
                    self.customRound(self.result[category]["f"][i], 2))
                row.extend([value])
            table.append(row)

        #averages
        row = ["Avg.", round(sum(self.accuracy) / len(self.accuracy), 2)]
        for category in self.categories:
            value = "{:.2f} {:.2f} {:.2f}".format(
                self.customAvg(self.result[category]["p"]),
                self.customAvg(self.result[category]["r"]),
                self.customAvg(self.result[category]["f"]))
            row.extend([value])
        table.append(row)

        print(tabulate.tabulate(table, headers=headers))
        print("\nLATEX table\n")
        print(tabulate.tabulate(table, headers=headers, tablefmt="latex"))

    def customAvg(self, l):
        try:
            returnValue = round(sum(l) / len(l), 2)
        except TypeError:
            returnValue = 0.0
        return returnValue

    def customRound(self, n, d):
        try:
            returnValue = round(n, d)
        except TypeError:
            returnValue = 0.0

        return returnValue
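
A minimal usage sketch (the surrounding module layout is an assumption): instantiating the class runs the whole pipeline, and passing -test on the command line switches from DEVTEST to TEST mode and pickles the trained classifiers.

if __name__ == '__main__':
    # DEVTEST mode by default; add "-test" as the only argument for TEST mode
    ClassifierCreator()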
Example #14
    def processor(self):
        pre_processor = PreProcessor()
        feature_extractor = FeatureExtractor()
        feature_selector = FeatureSelector()
        accuracy_checker = AccuracyChecker()
        y_train, x_train_sj, y_train_sj, x_train_iq, y_train_iq, x_test_sj, x_test_iq = self.read_data(
        )

        x_train_sj = pre_processor.impute_redundant_features(
            x_train_sj, self.impute_columns)
        x_train_iq = pre_processor.impute_redundant_features(
            x_train_iq, self.impute_columns)

        x_test_sj = pre_processor.impute_redundant_features(
            x_test_sj, self.impute_columns)
        x_test_iq = pre_processor.impute_redundant_features(
            x_test_iq, self.impute_columns)

        imputer_sj = Imputer(strategy='mean')
        x_train_sj = pre_processor.impute_missing_values(
            x_train_sj, self.features, imputer_sj)
        x_test_sj = pre_processor.impute_missing_values(
            x_test_sj, self.features, imputer_sj)

        imputer_iq = Imputer(strategy='mean')
        x_train_iq = pre_processor.impute_missing_values(
            x_train_iq, self.features, imputer_iq)
        x_test_iq = pre_processor.impute_missing_values(
            x_test_iq, self.features, imputer_iq)

        x_train_sj = feature_extractor.add_time_series_features(x_train_sj,
                                                                window=100)
        x_train_iq = feature_extractor.add_time_series_features(x_train_iq,
                                                                window=30)
        x_test_sj = feature_extractor.add_time_series_features(x_test_sj,
                                                               window=100)
        x_test_iq = feature_extractor.add_time_series_features(x_test_iq,
                                                               window=30)

        x_train_sj = feature_selector.drop_unnecessary_features(
            x_train_sj, self.drop_features, self.time_series_features)
        x_train_iq = feature_selector.drop_unnecessary_features(
            x_train_iq, self.drop_features, self.time_series_features)
        x_test_sj = feature_selector.drop_unnecessary_features(
            x_test_sj, self.drop_features, self.time_series_features)
        x_test_iq = feature_selector.drop_unnecessary_features(
            x_test_iq, self.drop_features, self.time_series_features)

        features_to_normalize = self.features + self.new_features

        x_train_sj[features_to_normalize] = x_train_sj[
            features_to_normalize].apply(pre_processor.normalize, axis=0)
        x_train_iq[features_to_normalize] = x_train_iq[
            features_to_normalize].apply(pre_processor.normalize, axis=0)
        x_test_sj[features_to_normalize] = x_test_sj[
            features_to_normalize].apply(pre_processor.normalize, axis=0)
        x_test_iq[features_to_normalize] = x_test_iq[
            features_to_normalize].apply(pre_processor.normalize, axis=0)

        x_train = pd.concat([x_train_sj, x_train_iq], axis=0)
        x_train.set_index('index', inplace=True)

        x_sj, y_sj = x_train.loc[x_train.city == 'sj', :], y_train.loc[
            x_train.city == 'sj', :]
        x_iq, y_iq = x_train.loc[x_train.city == 'iq', :], y_train.loc[
            x_train.city == 'iq', :]

        x_train_sj, x_cross_sj, y_train_sj, y_cross_sj = train_test_split(
            x_sj, y_sj, test_size=0.2, stratify=x_sj.weekofyear)

        x_train_iq, x_cross_iq, y_train_iq, y_cross_iq = train_test_split(
            x_iq, y_iq, test_size=0.2, stratify=x_iq.weekofyear)

        x_train_sj = feature_selector.select_features(x_train_sj,
                                                      self.features,
                                                      self.new_features)
        x_train_iq = feature_selector.select_features(x_train_iq,
                                                      self.features,
                                                      self.new_features)
        x_cross_sj = feature_selector.select_features(x_cross_sj,
                                                      self.features,
                                                      self.new_features)
        x_cross_iq = feature_selector.select_features(x_cross_iq,
                                                      self.features,
                                                      self.new_features)

        reg_sj_gb = GradientBoostingRegressor(learning_rate=0.1,
                                              max_depth=5,
                                              n_estimators=500,
                                              random_state=67)
        reg_iq_gb = GradientBoostingRegressor(learning_rate=0.1,
                                              max_depth=3,
                                              n_estimators=300,
                                              random_state=67)

        reg_sj_rf = RandomForestRegressor(max_depth=None,
                                          n_estimators=700,
                                          random_state=67)
        reg_iq_rf = RandomForestRegressor(max_depth=None,
                                          n_estimators=700,
                                          random_state=67)

        y_sj_pred_m1, y_iq_pred_m1 = self.model_trainor(
            reg_sj_gb, reg_iq_gb, x_train_sj, y_train_sj, x_train_iq,
            y_train_iq, x_cross_sj, x_cross_iq, "gb")
        y_sj_pred_m2, y_iq_pred_m2 = self.model_trainor(
            reg_sj_rf, reg_iq_rf, x_train_sj, y_train_sj, x_train_iq,
            y_train_iq, x_cross_sj, x_cross_iq, "rf")

        y_sj_pred, y_iq_pred = self.ensemble_model(y_sj_pred_m1, y_sj_pred_m2,
                                                   y_iq_pred_m1, y_iq_pred_m2,
                                                   5, 3)
        print("San Juan:")
        accuracy_checker.cross_validate_out_of_sample(y_sj_pred,
                                                      y_cross_sj.total_cases)
        print("Iquitos:")
        accuracy_checker.cross_validate_out_of_sample(y_iq_pred,
                                                      y_cross_iq.total_cases)

        predict_sj = x_test_sj[self.keys].copy()
        predict_iq = x_test_iq[self.keys].copy()

        x_sj = feature_selector.select_features(x_sj, self.features,
                                                self.new_features)
        x_iq = feature_selector.select_features(x_iq, self.features,
                                                self.new_features)
        x_test_sj = feature_selector.select_features(x_test_sj, self.features,
                                                     self.new_features)
        x_test_iq = feature_selector.select_features(x_test_iq, self.features,
                                                     self.new_features)

        reg_sj_gb = GradientBoostingRegressor(learning_rate=0.1,
                                              max_depth=5,
                                              n_estimators=500,
                                              random_state=67)
        reg_iq_gb = GradientBoostingRegressor(learning_rate=0.1,
                                              max_depth=3,
                                              n_estimators=300,
                                              random_state=67)

        reg_sj_rf = RandomForestRegressor(max_depth=None,
                                          n_estimators=700,
                                          random_state=67)
        reg_iq_rf = RandomForestRegressor(max_depth=None,
                                          n_estimators=700,
                                          random_state=67)

        y_sj_pred_m1, y_iq_pred_m1 = self.model_trainor(
            reg_sj_gb, reg_iq_gb, x_sj, y_sj, x_iq, y_iq, x_test_sj, x_test_iq,
            "gb")
        y_sj_pred_m2, y_iq_pred_m2 = self.model_trainor(
            reg_sj_rf, reg_iq_rf, x_sj, y_sj, x_iq, y_iq, x_test_sj, x_test_iq,
            "rf")
        y_sj_pred, y_iq_pred = self.ensemble_model(y_sj_pred_m1, y_sj_pred_m2,
                                                   y_iq_pred_m1, y_iq_pred_m2,
                                                   5, 3)
        predict_sj['total_cases'] = y_sj_pred.round().astype(int)
        predict_iq['total_cases'] = y_iq_pred.round().astype(int)

        predict_df = pd.concat([predict_sj, predict_iq], axis=0)
        predict_df.loc[predict_df.total_cases < 0, 'total_cases'] = 0

        self.write_results(predict_df)
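
ensemble_model is called above with trailing arguments 5 and 3. A plausible sketch, reading those as blend weights for the gradient-boosting and random-forest predictions (an assumption, since only the call sites are visible):

def ensemble_model(self, y_sj_m1, y_sj_m2, y_iq_m1, y_iq_m2, w1, w2):
    # weighted average of the two models' predictions, per city
    y_sj_pred = (w1 * y_sj_m1 + w2 * y_sj_m2) / float(w1 + w2)
    y_iq_pred = (w1 * y_iq_m1 + w2 * y_iq_m2) / float(w1 + w2)
    return y_sj_pred, y_iq_pred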
Example #15
    Y_test = pd.factorize(test[labelName])[0]
    X_test_origin = test.iloc[:, 0:test.columns.size - 1].copy()

    scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))

    scaler.fit(X_train_origin)
    #scaling of training data
    X_train_origin = pd.DataFrame(scaler.transform(X_train_origin.copy()),
                                  columns=X_train_origin.columns)
    # apply same transformation to test data
    X_test_origin = pd.DataFrame(scaler.transform(X_test_origin.copy()),
                                 columns=X_test_origin.columns)

    trainTmp = X_train_origin.copy()
    trainTmp[labelName] = Y_train
    fs = FeatureSelector(trainTmp)

    featureSize = data.columns.size
    threshold = 10

    clfNames = [
        "lbfgs", "adam", "sgd", "randomForest", "decisionTree", "rbf", "poly",
        "linear", "knn"
    ]

    while (featureSize >= threshold):
        features = fs.featureSelectionSelectKBestClassification(
            featureSize, labelName)
        print(features)
        clfs = [
            MLPClassifier(solver='lbfgs',
Example #16
    def fitInternal(self, currentData, parent_id, parent_edge,
                    parent_edge_operator, parent_data_type):
        #print("...... Start Fitting " + str(self.cur_num_node) + " nd node, parent id : " + str(parent_id) + ", parent edge :" + str(parent_edge) + " ...... \n")
        start = time.time()

        ###### Check terminate condition
        # 1. when dataset is empty
        if (currentData.shape[0] == 1):
            print("......  Fitting " + str(self.cur_num_node) +
                  " nd node, parent id : " + str(parent_id) +
                  ", parent edge :" + str(parent_edge) + " ...... Done!!!\n")
            print("Got empty dataset\n")
            return None

        curr_target, curr_target_count = np.unique(currentData[1:, -1].astype(
            np.float64),
                                                   return_counts=True)
        targetCompostion = None  # class distribution over the target values; needed later to compute the training error
        for count in curr_target_count:
            if targetCompostion is not None:
                targetCompostion += "|" + str(count)
            else:
                targetCompostion = str(count)

        self.cur_num_node += 1
        cur_idx = self.cur_num_node

        # 2. when target is already homogen
        if (len(curr_target) == 1):
            self.tree.append([
                cur_idx, parent_id, None, None, 0.0, 0.0, parent_edge,
                parent_edge_operator, 0.0, curr_target[0], targetCompostion,
                None, parent_data_type
            ])
            print("......  Fitting " + str(self.cur_num_node) +
                  " nd node, parent id : " + str(parent_id) +
                  ", parent edge :" + str(parent_edge) + " ...... Done!!!\n")
            return

        # 3. when no target or attribute
        if (currentData.shape[1] < 2):
            max_count, selected_class = self.selectMajorityClass(
                curr_target, curr_target_count)
            self.tree.append([
                cur_idx, parent_id, None, None, 0.0, 0.0, parent_edge,
                parent_edge_operator, 0.0, selected_class, targetCompostion,
                None, parent_data_type
            ])
            print("......  Fitting " + str(self.cur_num_node) +
                  " nd node, parent id : " + str(parent_id) +
                  ", parent edge :" + str(parent_edge) + " ...... Done!!!\n")
            return
        ###### Check terminate condition

        ## select a feature as current node
        splitter = FeatureSelector(self.pool_size, self.attributes_info,
                                   currentData, curr_target)
        splitter.doSelect()

        cur_attr_name = splitter.selected_attr_name
        cur_attr_idx = splitter.selected_attr_idx
        cur_threshold = splitter.selected_splitter
        cur_gain = splitter.selected_gain
        cur_split_info = splitter.selected_split_info
        thresholdCompostion = None  # row counts per outgoing edge of the current node; needed later to compute the training error

        # numeric feature: a single unique threshold is assumed to mean a numeric split
        if cur_threshold is not None and len(cur_threshold) == 1:
            cur_feature_type = 'numeric'
            no_header_data = currentData[1:, :]
            cur_threshold = cur_threshold[0]

            # 1. recursively fit left child
            less_eq_data = no_header_data[np.where(
                no_header_data[:, cur_attr_idx].astype(
                    np.float64) <= cur_threshold.astype(np.float64))]
            less_eq_data = np.concatenate(([currentData[0, :]], less_eq_data),
                                          axis=0)
            XXY_new2 = np.delete(less_eq_data, cur_attr_idx, 1)
            thresholdCompostion = str(XXY_new2.shape[0] - 1)
            self.fitInternal(XXY_new2, cur_idx, cur_threshold, '<=',
                             cur_feature_type)
            #print("left child fit done. parent node: " + str(cur_attr_name)+"\n")

            # 2. recursively fit right child
            greater_data = no_header_data[np.where(
                no_header_data[:, cur_attr_idx].astype(
                    np.float64) > cur_threshold.astype(np.float64))]
            greater_data = np.concatenate(([currentData[0, :]], greater_data),
                                          axis=0)
            XXY_new2 = np.delete(greater_data, cur_attr_idx, 1)
            thresholdCompostion += "|" + str(XXY_new2.shape[0] - 1)
            self.fitInternal(XXY_new2, cur_idx, cur_threshold, '>',
                             cur_feature_type)
            #print("right child fit done. parent node: " + str(cur_attr_name)+"\n")

        # discrete feature: more than one unique edge is assumed to mean a nominal split
        elif cur_threshold is not None:
            cur_feature_type = 'nominal'
            no_header_data = currentData[1:, :]
            cur_threshold_str = None
            for i in range(len(cur_threshold)):
                cur_threshold_str = str(
                    cur_threshold[i]
                ) if cur_threshold_str == None else cur_threshold_str + "|" + str(
                    cur_threshold[i])
                selected_dataset = no_header_data[np.where(
                    no_header_data[:, cur_attr_idx] == cur_threshold[i])]
                selected_dataset = np.concatenate(
                    ([currentData[0, :]], selected_dataset), axis=0)
                selected_dataset = np.delete(selected_dataset, cur_attr_idx, 1)
                thresholdCompostion = str(
                    selected_dataset.shape[0] - 1
                ) if thresholdCompostion == None else thresholdCompostion + "|" + str(
                    selected_dataset.shape[0] - 1)
                # '==' is the edge operator for a nominal branch
                self.fitInternal(selected_dataset, cur_idx, cur_threshold[i],
                                 '==', cur_feature_type)
            cur_threshold = cur_threshold_str
            print("left child fit done. parent node: " + str(cur_attr_name) +
                  "\n")
        else:
            print("cur idx: " + str(cur_idx))
            print(currentData)
            max_count, selected_class = self.selectMajorityClass(
                curr_target, curr_target_count)
            self.tree.append([
                cur_idx, parent_id, None, None, 0.0, 0.0, parent_edge,
                parent_edge_operator, 0.0, selected_class, targetCompostion,
                None, parent_data_type
            ])
            return

        # save node to the tree matrix. format: [treeIdx, parentId, attrName, colIndex, gain, splitInfo, parentEdge, parentEdgeOperator, threshold, leafVal, targetCompostion, thresholdCompostion, featureType]
        col_idx = None if cur_attr_name == None else self.attributes_info[
            cur_attr_name]['col_idx']
        self.tree.append([
            cur_idx, parent_id, cur_attr_name, col_idx, cur_gain,
            cur_split_info, parent_edge, parent_edge_operator, cur_threshold,
            None, targetCompostion, thresholdCompostion, cur_feature_type
        ])

        print("......  Fitting " + str(self.cur_num_node) +
              " nd node, parent id : " + str(parent_id) + ", parent edge :" +
              str(parent_edge) + " ...... Done in " +
              str(time.time() - start) + "!!!\n")
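
fitInternal relies on a selectMajorityClass helper that is not shown. A minimal sketch, assuming it returns the highest count together with the corresponding class value (matching how its return values are unpacked above):

import numpy as np

def selectMajorityClass(self, curr_target, curr_target_count):
    # pick the class with the largest count in the current partition
    max_idx = int(np.argmax(curr_target_count))
    return curr_target_count[max_idx], curr_target[max_idx]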