Example 1
import numpy as np
import pandas as pd
import keras
import sklearn.ensemble
from sklearn.metrics import classification_report


# dl is the project's data-loading module (imported elsewhere)
def test_classifier(dataset, classifier):

    print("-----TESTING CLASSIFIER-----")

    # check Sequential first: Sequential is a subclass of Model, so a plain
    # Model check would match Sequential models too
    if isinstance(classifier, keras.models.Sequential):

        x_test, y_test = dl.prepare_data_for_NN(dataset)

        print("-----TEST SET SIZE: " + str(len(x_test)) + "-----")
        scores = classifier.evaluate(x_test, y_test)
        print("\n%s: %.2f%%" % (classifier.metrics_names[1], scores[1] * 100))

    elif isinstance(classifier, keras.models.Model):

        x_test, y_test = dl.prepare_data_for_RNN(dataset)

        print("-----TEST SET SIZE: " + str(x_test["sentence1"].shape) +
              "-----")
        scores = classifier.evaluate(x_test, y_test)
        print("\n%s: %.2f%%" % (classifier.metrics_names[1], scores[1] * 100))

    elif isinstance(classifier, sklearn.ensemble.RandomForestClassifier):

        x_test, y_test = dl.prepare_data_for_RF(dataset)
        print("-----TEST SET SIZE: " + str(len(x_test)) + "-----")

    else:
        x_test, y_test = dl.prepare_data_for_ZeroR(dataset)
        print("-----TEST SET SIZE: " + str(len(x_test)) + "-----")

    prediction = classifier.predict(x_test)

    numberOfClasses = y_test.shape[1]

    position = np.argmax(prediction, axis=-1)
    y_pred = np.identity(numberOfClasses)[position]

    target_names = ['nonrelated', 'related']
    print(classification_report(y_test, y_pred, target_names=target_names))

    y_test = [np.where(r == 1)[0][0] for r in y_test]
    y_pred = [np.where(r == 1)[0][0] for r in y_pred]

    y_true = pd.Series(y_test)
    y_pred = pd.Series(y_pred)

    print(
        pd.crosstab(y_true,
                    y_pred,
                    rownames=['True'],
                    colnames=['Predicted'],
                    margins=True))
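
The one-hot-to-label conversion above can be written more compactly with
np.argmax; a minimal equivalent sketch, with the two small arrays purely
illustrative:

import numpy as np

y_test = np.array([[1, 0], [0, 1], [0, 1]])
y_pred = np.array([[1, 0], [1, 0], [0, 1]])

# argmax over the class axis recovers the integer labels directly,
# equivalent to the np.where list comprehensions used above
labels_true = np.argmax(y_test, axis=1)
labels_pred = np.argmax(y_pred, axis=1)
print(labels_true, labels_pred)  # [0 1 1] [0 0 1]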
Example 2
from keras.models import Sequential
from keras.layers import Dense


def train_NN_classifier(dataset, epochs, singlePrint=False):

    print("-----TRAIN CLASSIFIER-----")

    x_train, y_train = dl.prepare_data_for_NN(dataset)
    numberOfClasses = y_train.shape[1]

    model = Sequential()
    model.add(Dense(500, input_dim=len(x_train[0]), activation='sigmoid'))
    model.add(Dense(numberOfClasses, activation='softmax'))
    # categorical_crossentropy matches the softmax output over one-hot labels
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    if singlePrint:
        history = model.fit(x_train,
                            y_train,
                            validation_split=0.2,
                            epochs=epochs,
                            batch_size=150,
                            verbose=0)
        # newer Keras versions key this as "accuracy" rather than "acc"
        print(history.history["acc"])
    else:
        history = model.fit(x_train,
                            y_train,
                            validation_split=0.2,
                            epochs=epochs,
                            batch_size=150)

    plot_Training(history)

    print("-----TRAINING COMPLETE-----")
    return model
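
plot_Training is defined elsewhere in the project; a minimal sketch of what
such a helper might look like, assuming Matplotlib and a Keras History object
(older Keras records the keys "acc"/"val_acc", newer versions
"accuracy"/"val_accuracy"):

import matplotlib.pyplot as plt

def plot_Training(history):
    # pick whichever accuracy key this Keras version recorded
    acc_key = "acc" if "acc" in history.history else "accuracy"
    plt.plot(history.history[acc_key], label="train")
    plt.plot(history.history["val_" + acc_key], label="validation")
    plt.xlabel("epoch")
    plt.ylabel("accuracy")
    plt.legend()
    plt.show()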
Example 3
import numpy as np
import pandas as pd
from sklearn.metrics import classification_report


def baseline_results(dataset):

    x_test, y_test = dl.prepare_data_for_ZeroR(dataset)

    # uniform random 0/1 predictions, one-hot encoded against the true
    # number of classes (the intermediate encoding via np.unique is not
    # needed and breaks if the random draw happens to miss a class)
    prediction = np.random.choice(2, x_test.shape[0])

    numberOfClasses = y_test.shape[1]
    y_pred = np.identity(numberOfClasses)[prediction]

    target_names = ['nonrelated', 'related']
    print(classification_report(y_test, y_pred, target_names=target_names))

    y_test = [np.where(r == 1)[0][0] for r in y_test]
    y_pred = [np.where(r == 1)[0][0] for r in y_pred]

    y_true = pd.Series(y_test)
    y_pred = pd.Series(y_pred)

    print(
        pd.crosstab(y_true,
                    y_pred,
                    rownames=['True'],
                    colnames=['Predicted'],
                    margins=True))
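
The uniform random baseline above can also be expressed with scikit-learn's
DummyClassifier; a minimal sketch, with the feature matrix and labels purely
illustrative:

import numpy as np
from sklearn.dummy import DummyClassifier

x_test = np.random.rand(100, 4)          # hypothetical feature matrix
y_labels = np.random.choice(2, 100)      # hypothetical 0/1 labels

# "uniform" predicts each class with equal probability, matching
# np.random.choice(2, n) in baseline_results
baseline = DummyClassifier(strategy="uniform", random_state=0)
baseline.fit(x_test, y_labels)
print(baseline.score(x_test, y_labels))  # hovers around 0.5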
Example 4
from sklearn.dummy import DummyClassifier


def train_Dummy_classifier(train):

    x_train, y_train = dl.prepare_data_for_ZeroR(train)

    # "stratified" draws random predictions that follow the training
    # label distribution
    classifier = DummyClassifier(strategy="stratified", random_state=0)
    classifier.fit(x_train, y_train)

    return classifier
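
A minimal usage sketch, with train and test standing in for DataFrames
prepared as in the other examples:

dummy = train_Dummy_classifier(train)
x_test, y_test = dl.prepare_data_for_ZeroR(test)
print(dummy.score(x_test, y_test))  # mean accuracy of the dummy baseline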
Example 5
    def balance_data(self, dataset, balancing=0.5):
        """balances the passed data
            parameter:
            dataset: pandas dataframe containing the data
            balancing: precentage of the balancing -> 0.5 = equal 50-50 balncing
        """

        dataset = dl.balance_dataset(dataset, balancing)

        return dataset
Example 6
    def split_data(self, dataset, splitting=0.1):
        """splits data into training and testset
            parameter:
            dataset: pandas dataframe containing the training data
            splitting: percentage split of the data 0.1 = 10% testing data
            return values: trainset, testset
        """

        train, test = dl.supervised_split(dataset, splitting)

        return train, test
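
Taken together with balance_data above, a typical preprocessing sketch
(arguE stands in for an instance of the surrounding class, dataset for a
loaded DataFrame):

# split first, then balance only the training portion so the test set
# keeps its natural class distribution
train, test = arguE.split_data(dataset, splitting=0.1)
train = arguE.balance_data(train, balancing=0.5)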
Example 7
from sklearn.ensemble import RandomForestClassifier


def train_RF_classifier(dataset):

    print("-----TRAIN CLASSIFIER-----")

    x_train, y_train = dl.prepare_data_for_RF(dataset)

    estimators = 200

    randomForest = RandomForestClassifier(n_estimators=estimators)

    randomForest.fit(x_train, y_train)

    print("-----TRAINING COMPLETE-----")
    return randomForest
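
The commented-out paths in the main script below suggest the trained forest
is persisted as a .pkl file; a minimal sketch using joblib, with the path
purely illustrative:

import joblib

randomForest = train_RF_classifier(trainSet)  # trainSet as in the main script
joblib.dump(randomForest, "resources/classifierModels/all_rf.pkl")
randomForest = joblib.load("resources/classifierModels/all_rf.pkl")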
Example 8
    def load_Data(self, dataDirectory=None, store=None):
        """load the data from XML-file and automatically extracts the features
            parameter:
            dataDirectory: directory of the XML-file
            store: directory to the h5.file for storing the loaded data and features
        """

        data = dl.loadData(dataDirectory)
        dataset = self.generate_Features(data)

        if store is not None:
            dataStore = pd.HDFStore(store)
            dataStore['dataset'] = data
            dataStore['feature'] = dataset
            dataStore.close()

        return dataset
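
A minimal usage sketch, assuming the directory layout of the main script
below (paths illustrative, arguE an instance of the surrounding class):

dataset = arguE.load_Data(dataDirectory="resources/datasets/brat-project/",
                          store="resources/datasets/se.h5")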
Example 9
from keras.layers import Dense, Input, LSTM, concatenate
from keras.models import Model


def train_RNN_classifier(dataset, epochs, singlePrint=False):

    print("-----TRAIN CLASSIFIER-----")

    x_train, y_train = dl.prepare_data_for_RNN(dataset)

    numberOfClasses = y_train.shape[1]
    print("Number of classes:" + str(numberOfClasses))

    lstm_input_dim = x_train["sentence1"].shape[1:]
    concatenateInput = x_train["sharedFeatures"].shape[1:]

    sentence1 = Input(lstm_input_dim, name="sentence1")
    sentence2 = Input(lstm_input_dim, name="sentence2")
    sharedFeatures = Input(concatenateInput, name="sharedFeatures")

    lstm1 = LSTM(16, return_sequences=False)(sentence1)
    lstm2 = LSTM(16, return_sequences=False)(sentence2)
    concatenateLayer = concatenate([lstm1, lstm2, sharedFeatures], axis=-1)
    dense = Dense(500, activation='sigmoid')(concatenateLayer)
    softmax = Dense(numberOfClasses, activation='softmax')(dense)

    model = Model(inputs=[sentence1, sentence2, sharedFeatures],
                  outputs=[softmax])
    # categorical_crossentropy matches the softmax output over one-hot labels
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    if singlePrint:
        history = model.fit(x_train,
                            y_train,
                            validation_split=0.2,
                            epochs=epochs,
                            batch_size=150,
                            verbose=0)
        # newer Keras versions key this as "accuracy" rather than "acc"
        print(history.history["acc"])
    else:
        history = model.fit(x_train,
                            y_train,
                            validation_split=0.2,
                            epochs=epochs,
                            batch_size=500)

    plot_Training(history)

    print("-----TRAINING COMPLETE-----")
    return model
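
The main script below saves the trained RNN to an .h5 file; a minimal sketch
of saving and reloading with the standard Keras API (path illustrative):

from keras.models import load_model

model = train_RNN_classifier(trainSet, epochs=25)
model.save("resources/classifierModels/se_rnn.h5")
model = load_model("resources/classifierModels/se_rnn.h5")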
Example 10
    def show_dataset_statistics(self):
        """print the statistics of the datasets"""

        print("AraucariaDB")
        dl.loadStatistics("resources/corpora/araucaria")
        print("microtext")
        dl.loadStatistics("resources/corpora/microtext")
        print("rrd")
        dl.loadStatistics("resources/corpora/rrd")
        print("schemes")
        dl.loadStatistics("resources/corpora/schemes")
        print("STAB")
        dl.loadStatistics("resources/corpora/studentEssays")
        print("IBM")
        dl.loadStatistics("resources/corpora/ibm")
        print("ArguE")
        dl.loadStatistics("resources/corpora/arguE")
Example 11
import inspect
import os

import pandas as pd

# dl, af, and ArguE are the project's data-loading, feature-extraction,
# and pipeline modules


class main:

    current_dir = os.path.dirname(inspect.stack()[0][1]) + '/'
    aif = current_dir + "resources/datasets/aif.h5"
    se = current_dir + "resources/datasets/se.h5"
    ibm = current_dir + "resources/datasets/ibm.h5"
    argu = current_dir + "resources/datasets/arguE.h5"

    aifTrain = current_dir + "resources/datasets/training/aifTrain.h5"
    aifTest = current_dir + "resources/datasets/testing/aifTest.h5"
    seTrain = current_dir + "resources/datasets/training/seTrain.h5"
    seTest = current_dir + "resources/datasets/testing/seTest.h5"
    ibmTrain = current_dir + "resources/datasets/training/ibmTrain.h5"
    ibmTest = current_dir + "resources/datasets/testing/ibmTest.h5"
    argueTrain = current_dir + "resources/datasets/training/argueTrain.h5"
    argueTest = current_dir + "resources/datasets/testing/argueTest.h5"

    arguE = ArguE()
    
    ####### Build resources if not existing #######
    if not os.path.exists(seTrain):
        if not os.path.exists(se):
            se_data = dl.loadData(current_dir + 'resources/datasets/brat-project/')
            AFE = af.AdvancedFeatureExtractor()
            se_data = AFE.extractFeatures(se_data)
            store = pd.HDFStore(se,'w')
            store["feature"] = se_data
            store.close()
            print("SE generated")
        se_data = arguE.load_Data_From_Store(se)
        se_train, se_test = arguE.split_data(se_data)
        store = pd.HDFStore(seTrain,'w')
        store["feature"] = se_train
        store.close()
        store = pd.HDFStore(seTest,'w')
        store["feature"] = se_test
        store.close()
        print("Train-test generated")
        

    #######################################################################

    ####### Training #######

    print("################## TRAINING:")

    #remap the labels and balance the training data
    trainSet = arguE.load_Data_From_Store(seTrain)
    trainSet = arguE.change_labels(trainSet)
    trainSet = arguE.balance_data(trainSet)

    #OneR = arguE.train_Dummy_classifier(trainSet, current_dir + "resources/classifierModels/se_or.pkl")
    #RF = arguE.train_RF_classifier(trainSet, current_dir+ "resources/classifierModels/all_rf.pkl")
    RNN = arguE.train_RNN_classifier(trainSet, epochs=25, saveModel=current_dir + "resources/classifierModels/se_rnn.h5")

    ####### Testing #######

    print("################## TESTING:")

    testSet = arguE.load_Data_From_Store(seTest)
    testSet = arguE.change_labels(testSet)

    #arguE.test_classifier(testSet, OneR)

    #arguE.test_classifier(testSet, RF)

    arguE.test_classifier(testSet, RNN)
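
Note that the statements sit directly in the body of class main, so they run
as soon as the module is executed; assuming the file is named main.py
(hypothetical), importing it is enough to trigger the whole pipeline:

# executing or importing the module runs training and testing end to end,
# because the class body is evaluated at definition time
import main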