Example #1
def main():
    # get the path for the input file arguments
    parser = argparse.ArgumentParser()
    parser._action_groups.pop()
    required = parser.add_argument_group('required arguments')
    optional = parser.add_argument_group('optional arguments')
    required.add_argument('-tr', '--train_image', help='image used for training the algorithm', required=True)
    required.add_argument('-te', '--test_image', help='image to evaluate', required=True)
    optional.add_argument('-l', '--log', dest="logLevel", choices=['DEBUG', 'debug', 'INFO', 'info', 'ERROR', 'error'],
                          help='argument used to set the logging level')
    optional.add_argument('-knn', '--knn', help='flag to run knn', action='store_true')

    args = parser.parse_args()

    logger_initialization(log_level=args.logLevel)

    logging.getLogger('regular.time').info('starting the handwritten-notes script')

    digits, y_train = load_digits(args.train_image)

    x_train = pixels_to_hog_20(digits)

    num_pixels = x_train.shape[1]
    num_classes = len(np.unique(y_train))

    if args.knn:
        logging.getLogger('regular.time').info('training knn model')
        model = KNeighborsClassifier()
        model.fit(x_train, y_train)
    else:
        logging.getLogger('regular.time').info('training NN model')
        model = Sequential()
        model.add(Dense(num_pixels, input_dim=num_pixels, kernel_initializer='normal', activation='relu'))
        model.add(Dense(num_classes, kernel_initializer='normal', activation='softmax'))   
        # Compile model
        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    process_test_image(dataset=args.test_image, model=model, model_type=args.knn)
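The NN branch above only builds and compiles the network; since it uses categorical_crossentropy, the labels must be one-hot encoded before any fit call. A minimal sketch of that step, assuming y_train holds integer class indices and that the actual fitting otherwise happens inside process_test_image (the epoch and batch values are illustrative only):

from keras.utils import to_categorical

# one-hot encode the integer labels so they match the softmax output layer
y_train_oh = to_categorical(y_train, num_classes)
model.fit(x_train, y_train_oh, epochs=10, batch_size=128, verbose=2)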
Example #2
class AnomaLog():
    def __init__(self, multiclass=True, model_type='DNN'):
        """
        The __init__ method is the initializer of the class.

        :param multiclass: it has to be True in order to fit, evaluate and use as
                           analyzer a multiclass classifier, or False to use a
                           2-class classifier (True by default)
        :param model_type: it can be 'DNN' (Deep Neural Network), 'SVM' (Support
                           Vector Machine), 'DT' (Decision Tree), 'RF' (Random
                           Forest), or 'KNN' (K-Nearest Neighbors) in order to use
                           the respective model as classifier ('DNN' by default)
        """
        self.normal_class = None
        self.classes = None
        self.classes_names = None
        self.fields = None
        self.external_units = 10
        self.internal_units = 50
        self.loss = 'categorical_crossentropy'
        self.optimizer = 'adam'
        self.norm_bias = 0
        self.model_type = model_type
        self.multiclass = multiclass
        self.dummies = dict()

    def dataset_reader(self, filename, column_names=None):
        """
        The dataset_reader method reads a csv file and returns the dataset,
        dropping the columns having missing values and optionally setting the
        column names.

        :param filename: it is the name of the file (with its path) which contains
                              the dataset
        :param column_names: it has to be a list of strings in order to set the
                              column names, None otherwise (None by default)

        :return: the dataframe representing the dataset
        """
        aux_df = pd.read_csv(filename, header=None)
        aux_df.dropna(inplace=True, axis=1)
        df = aux_df.copy()
        del aux_df
        if not (column_names is None):
            df.columns = column_names
        return df

    def _model_creation(self, data=None, labels=None):
        """
        The _model_creation method is used in order to create the model when the fit
        method is called.

        :param data: it is the training set (None by default)
        :param labels: it is the list of labels (None by default)
        """
        if self.model_type == 'DNN':
            input_dimension = data.shape[1]
            output_dimension = labels.shape[1]

            self.model = Sequential()
            self.model.add(Dense(self.external_units,
                                 input_dim=input_dimension,
                                 activation='relu'))
            self.model.add(Dense(self.internal_units,
                                 input_dim=input_dimension,
                                 activation='relu'))
            self.model.add(Dense(self.external_units,
                                 input_dim=input_dimension,
                                 activation='relu'))
            self.model.add(Dense(1, kernel_initializer='normal'))
            self.model.add(Dense(output_dimension, activation='softmax'))
            self.model.compile(loss=self.loss, optimizer=self.optimizer)
        elif self.model_type == 'SVM':
            self.model = CalibratedClassifierCV(LinearSVC())
        elif self.model_type == 'DT':
            self.model = DecisionTreeClassifier()
        elif self.model_type == 'RF':
            self.model = RandomForestClassifier()
        elif self.model_type == 'KNN':
            self.model = KNeighborsClassifier(int(data.shape[1] ** (1 / 2)))

    def fit(self, x_train, y_train, validation_fraction=0, epochs_number=1000,
            pat=5):
        """
        The fit method is used in order to create and fit the classifier.

        :param x_train: it is the training set
        :param y_train: it is the list of training labels
        :param validation_fraction: it is the fraction of samples to use as
                                    validation samples; if it is not zero, the
                                    validation loss is monitored in order to decide
                                    when to stop the training of the classifier (0
                                    by default, used only for the DNN model)
        :param epochs_number: it is the maximum number of training epochs, reached
                                    only if the classifier does not stop its fitting
                                    earlier (1000 by default, used only for the DNN
                                    model)
        :param pat: it is the patience, i.e. the maximum number of epochs without
                                    any improvement in the validation loss (if
                                    validation_fraction is not zero) or in the
                                    training loss (otherwise) after which the
                                    classifier stops its training (5 by default,
                                    used only for the DNN model)
        """
        if self.model_type == 'DNN':
            self._DNN_fit(x_train, y_train, validation_fraction, epochs_number, pat)
        else:
            # pass the data so the KNN branch can size its number of neighbors
            self._model_creation(x_train, y_train)
            self.model.fit(x_train, np.argmax(y_train, axis=1))

    def _DNN_fit(self, x_train, y_train, validation_fraction=0,
                 epochs_number=1000, pat=5):
        """
        The _DNN_fit method is used in order to create and fit the DNN classifier,
        by automatically stopping the training phase after a predefined number of
        epochs or after a certain number of epochs without any improvement in the
        considered loss metric.

        :param x_train: it is the training set
        :param y_train: it is the list of training labels
        :param validation_fraction: it is the fraction of samples to use as
                                    validation samples; if it is not zero, the
                                    validation loss is monitored in order to decide
                                    when to stop the training of the classifier (0
                                    by default)
        :param epochs_number: it is the maximum number of training epochs, reached
                                    only if the classifier does not stop its fitting
                                    earlier (1000 by default)
        :param pat: it is the patience, i.e. the maximum number of epochs without
                                    any improvement in the validation loss (if
                                    validation_fraction is not zero) or in the
                                    training loss (otherwise) after which the
                                    classifier stops its training (5 by default)
        """
        self._model_creation(x_train, y_train)
        if validation_fraction == 0:
            monitor = EarlyStopping(monitor='loss',
                                    min_delta=1e-3,
                                    patience=pat,
                                    verbose=1,
                                    mode='auto',
                                    restore_best_weights=True)
            self.model.fit(x_train, y_train, verbose=2, epochs=epochs_number,
                           callbacks=[monitor])
        else:
            x_train, x_val, y_train, y_val = self.split_data(x_train,
                                                             y_train,
                                                             validation_fraction)
            monitor = EarlyStopping(monitor='val_loss',
                                    min_delta=1e-3,
                                    patience=pat,
                                    verbose=1,
                                    mode='auto',
                                    restore_best_weights=True)
            self.model.fit(x_train, y_train, validation_data=(x_val, y_val),
                           callbacks=[monitor], verbose=2, epochs=epochs_number)

    def _encode_text(self, df, name, analysisFLAG=False):
        """
        The _encode_text method is used by the compute_dataset method in order to
        encode the textual elements of a column as dummy values.

        :param df: it is the dataframe which represents the dataset
        :param name: it is the name of the column on which to compute the dummy values
        :param analysisFLAG: it has to be True if the encoding is related to an
                             analysis of a raw dataset by using a previously fitted
                             analyzer, False otherwise (False by default)

        :return: the managed dataframe which represents the dataset
        """
        if analysisFLAG is False:
            dummies = pd.get_dummies(df[name])
            self.dummies[name] = dummies.columns
        else:
            aux = df[name]
            columns = self.dummies[name]
            aux = aux.astype(pd.CategoricalDtype(categories=columns))
            dummies = pd.get_dummies(aux)
        for x in dummies.columns:
            dummy_name = f"{name}-{x}"
            df[dummy_name] = dummies[x]
        df.drop(name, axis=1, inplace=True)
        return df

    def compute_dataset(self, df, labels_column=None, normal_class=None, analysisFLAG=False):
        """
        The compute_dataset method computes the dataset by replacing the textual
        columns with their dummy-value encodings and dropping the columns having
        missing values.

        :param df: it is the dataframe which represents the dataset
        :param labels_column: it is the name of the column containing the sample
                              labels; if it is None, all the columns are used to
                              compute the features and no labels list is computed
                              (None by default)
        :param normal_class: it is the name of the non-anomalous class, stored as an
                              attribute of the object so that it can be used in the
                              evaluation step (optional, None by default)
        :param analysisFLAG: it has to be False if the dataset has to be computed
                              during the fitting of the system, True otherwise
                              (False by default)

        :return: the dataset of the features, and the list of labels (if
                              labels_column is not None)
        """
        df_columns = df.columns
        if not (labels_column is None):
            self.fields = []
            for column in df_columns:
                if column != labels_column:
                    self.fields.append(column)

        aux_df = df.copy()
        aux = df.values[0]
        for i in range(len(df_columns) - 1):
            aux = aux_df[df_columns[i]].values
            if type(aux[0]) is str:
                aux_df = self._encode_text(aux_df, df_columns[i], analysisFLAG)
        aux_df.dropna(inplace=True, axis=1)

        if labels_column is None:
            x = aux_df.to_numpy()
            return x
        else:
            if not (normal_class is None):
                idx = self._class_index(aux_df, normal_class, labels_column)
                if self.multiclass is False:
                    for i in range(len(aux_df[labels_column])):
                        if aux_df[labels_column].values[i] != normal_class:
                            aux_df[labels_column].values[i] = 'Anomalous'

            x_columns = aux_df.columns.drop(labels_column)
            x = aux_df[x_columns].values
            dummies = pd.get_dummies(aux_df[labels_column])
            y = dummies.values

            self._find_classes(aux_df, labels_column, y)

            if not (normal_class is None):
                self.normal_class = int(np.argmax(y[idx]))
            return x, y

    def _find_classes(self, df, labels_column, y):
        """
        The _find_classes method is used by the compute_dataset method in order to
        store the indexes associated to each class label and the ordered names in
        classes and classes_names attributes, respectively.

        :param df: it is the dataframe representing the dataset
        :param labels_column: it is the name of the column containing the sample
                              labels
        :param y: it is the list of labels
        """
        unique_df = df.drop_duplicates(subset=[labels_column])
        classes = unique_df[labels_column].values.tolist()
        self.classes = []
        self.classes_names = []
        for class_name in classes:
            idx = self._class_index(df, class_name, labels_column)
            self.classes.append(np.argmax(y[idx]))
            self.classes_names.append(class_name)

    def _class_index(self, df, class_name, labels_column):
        """
        The _class_index method is used in order to find the index of the first
        sample of the dataset belonging to a specific class.

        :param df: it is the dataframe representing the dataset
        :param class_name: it is the class whose first sample index has to be found
                              in the dataset
        :param labels_column: it is the name of the column containing the sample
                              labels

        :return: the index of the first sample belonging to the chosen class
        """
        labels = df[labels_column].values.tolist()
        return labels.index(class_name)

    def split_data(self, data, labels, test_fraction, idx_flag=False):
        """
        The split_data method randomly splits the dataset and the related list of
        labels into two datasets (training set and test set) with their lists of
        labels, and optionally also returns the indexes of the samples in the
        original dataset.

        :param data: it is the dataset which has to be split
        :param labels: it is the list of labels
        :param test_fraction: it is the amount of data to use as test set (treated
                              as a fraction of the whole dataset if a value less
                              than 1 is used, as the number of test samples
                              otherwise)
        :param idx_flag: it has to be True in order to also return the lists of
                         training and test indexes (False by default)

        :return: the training set, the test set, the training labels and the test
                 labels if idx_flag = False, plus the list of training indexes and
                 the list of test indexes otherwise (all in numpy.array format)
        """

        n_samples = len(data)

        if test_fraction < 1:
            test_fraction = int(test_fraction * n_samples)
        idx = list(np.random.permutation(n_samples))
        idx_test = np.asarray(idx[0:test_fraction])
        idx_train = np.asarray(idx[test_fraction:])
        X_train = np.asarray([data[i] for i in idx_train])
        X_test = np.asarray([data[i] for i in idx_test])
        y_train = np.asarray([labels[i] for i in idx_train])
        y_test = np.asarray([labels[i] for i in idx_test])
        if idx_flag:
            return X_train, X_test, y_train, y_test, idx_train, idx_test
        return X_train, X_test, y_train, y_test

    def evaluate_performance(self, x_test, y_test, norm_bias=0, aucFLAG=False):
        """
        The evaluate_performance method is used to test the classifier, and to show
        the related performance.

        :param x_test: it is the test set
        :param y_test: it is the list of test labels
        :param norm_bias: it is the value subtracted from the normal class score
                          before predicting the classes to which the samples belong;
                          a value greater than zero reduces the missed alarms but
                          increases the false positive rate (0 by default)
        :param aucFLAG: it has to be True in order to also return the AUC (Area
                          Under the Curve) value in case of a binary (2-class)
                          classification, False otherwise (False by default)

        :return: the mean accuracy, the number of false normals (missed anomalies),
                 the number of false anomalies (false alarms), the number of true
                 anomalies, the number of true normals, and the AUC value (if
                 aucFLAG is True in a binary classification)
        """
        if self.model_type == 'DNN':
            pred = self.model.predict(x_test)
        else:
            pred = self.model.predict_proba(x_test)
        pred = self._apply_bias(pred, norm_bias)
        y_pred = np.argmax(pred, axis=1)
        y_eval = np.argmax(y_test, axis=1)
        score, falseNormal, falseAnomalous, trueAnomalous, trueNormal = self._classification_metrics(y_eval, y_pred,
                                                                                                     x_test)
        if self.multiclass is False:
            # the ROC curve is computed against the true labels, not the predictions
            auc = self._roc(y_eval, pred)
            if aucFLAG is True:
                return score, falseNormal, falseAnomalous, trueAnomalous, trueNormal, auc
        return score, falseNormal, falseAnomalous, trueAnomalous, trueNormal

    def _apply_bias(self, pred, norm_bias):
        """
        The _apply_bias method subtracts a static bias from the scores related to
        the normal class.

        :param pred: it is the scores matrix
        :param norm_bias: it is the bias value

        :return: the adjusted scores matrix
        """
        self.norm_bias = norm_bias
        if norm_bias != 0:
            for i in range(len(pred)):
                # lowering the normal-class score makes the classifier more prone to flag anomalies
                pred[i][self.normal_class] -= norm_bias
        return pred

    def _classification_metrics(self, y_eval, y_pred, x_test):
        """
        The _classification_metrics method is used to compute some performance
        evaluations on the classifier.

        :param y_eval: it is the list of true test labels
        :param y_pred: it is the list of predicted test labels
        :param x_test: it is the test dataset

        :return: the mean accuracy, the number of false normals (missed anomalies),
                 the number of false anomalies (false alarms), the number of true
                 anomalies and the number of true normals
        """
        N = len(y_pred)
        if self.model_type == 'DNN':
            score = metrics.accuracy_score(y_eval, y_pred)
        else:
            score = self.model.score(x_test, y_eval)
        print("Accuracy: {}".format(score))
        print()
        if not (self.normal_class is None):
            totalNormal = np.sum([x == self.normal_class for x in y_eval])
            totalAnomalous = len(y_eval) - totalNormal
            falseNormal = np.sum([((y_eval[x] != y_pred[x]) and y_pred[x] == self.normal_class) for x in range(N)])
            falseAnomalous = np.sum([((y_eval[x] != y_pred[x]) and y_pred[x] != self.normal_class) for x in range(N)])
            trueAnomalous = totalAnomalous - falseNormal
            trueNormal = totalNormal - falseAnomalous
            self._confusion_matrix(trueNormal, falseNormal, trueAnomalous, falseAnomalous)
            print()
            print(metrics.classification_report(y_eval, y_pred, labels=self.classes, target_names=self.classes_names))
        return score, falseNormal, falseAnomalous, trueAnomalous, trueNormal

    def _roc(self, y_true, y_score):
        """
        The _roc method shows the ROC (Receiver operating characteristic) curve
        related to the positive (anomalous) class, and computes the correspondent
        AUC (Area Under the Curve) value (this method is used in case of 2-class
        classification).

        :param y_true: it is the list of labels
        :param y_score: it is the matrix which represents the scores related to each
                        class for each sample

        :return: the auc value
        """
        idx = 1
        if self.normal_class == 1:
            idx = 0
        fpr, tpr, _ = metrics.roc_curve(y_true, y_score[:, idx], pos_label=idx)
        roc_auc = metrics.auc(fpr, tpr)
        plt.figure()
        lbl = 'ROC curve (area = %0.4f)' % roc_auc
        plt.plot(fpr, tpr, color='darkorange', label=lbl)
        plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
        plt.xlim([-0.01, 1.0])
        plt.ylim([0.0, 1.05])
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('Receiver Operating Characteristic curve')
        plt.legend(loc="lower right")
        plt.show()
        return roc_auc

    def _confusion_matrix(self, trueNormal, falseNormal, trueAnomalous, falseAnomalous):
        """
        The _confusion_matrix method computes the 2-class confusion matrix between
        the anomalous and the normal samples.

        :param trueNormal: it is the number of normal samples predicted as normal
        :param falseNormal: it is the number of anomalous samples predicted as
                               normal
        :param trueAnomalous: it is the number of anomalous samples predicted as
                               anomalous
        :param falseAnomalous: it is the number of normal samples predicted as
                               anomalous
        """
        norm_space = "         "
        anomal_space = "         "
        for i in range(6 - len(str(trueNormal))):
            norm_space += " "
        for i in range(6 - len(str(falseNormal))):
            anomal_space += " "
        print("                       Normal       Anomalous")
        print("Predicted Normal        " + str(trueNormal) + norm_space + str(falseAnomalous))
        print("Predicted Anomalous     " + str(falseNormal) + anomal_space + str(trueAnomalous))

    def save(self, filename='IDS'):
        """
        The save method saves the classifier model and the settings (classifier
        model type, multiclass flag, classes names, classes labels, log parameters,
        index of the normal class and the last used bias for the normal score) into
        a .h5 file (in the case of a DNN) or a .pkl file (otherwise) and a .txt
        file, respectively (any existing files with the same name are replaced).

        :param filename: it is the name of the files (without their extension) in
                         which the model and the settings are saved ('IDS' by default)
        """
        settings_file = filename + '.txt'
        if os.path.exists(settings_file):
            os.remove(settings_file)

        f = open(settings_file, "a")
        f.write("Model=" + str(self.model_type))
        f.write("\nMulticlass=" + str(self.multiclass))
        f.write("\nNormal=" + str(self.normal_class))
        cn = "Names="
        cl = "Labels="
        flds = "Fields="

        f.write("\n%s" % cn)
        for i in range(len(self.classes_names)):
            f.write("%s " % self.classes_names[i])

        f.write("\n%s" % cl)
        for i in range(len(self.classes)):
            f.write("%s " % str(self.classes[i]))

        f.write("\n%s" % flds)
        for field in self.fields:
            aux_field = field.split()
            if len(aux_field) > 1:
                field = ""
                for a in aux_field:
                    field += a
            f.write("%s " % field)

        f.write("\nBias=%s" % str(self.norm_bias))

        f.close()

        dummy_name = filename + '_dummies.pkl'
        with open(dummy_name, 'wb') as f:
            pickle.dump(self.dummies, f, pickle.HIGHEST_PROTOCOL)

        if self.model_type == 'DNN':
            filename = filename + ".h5"
            self.model.save(filename)
        else:
            filename = filename + ".pkl"
            joblib.dump(self.model, filename)

        print("Dummies values saved in " + dummy_name)
        print("Model saved in " + filename)
        print("Settings saved in " + settings_file)

    def load(self, filename):
        """
        The load method loads the classifier model and the settings (classes names,
        classes labels, log parameters and index of the normal class) from a .h5 or
        .pkl file and a .txt file, respectively.

        :param filename: it is the name of the files (without their extension) from
                         which the model and the settings are loaded
        """
        settings_file = filename + '.txt'
        settingFLAG = not (os.path.exists(settings_file))
        h5FLAG = not (os.path.exists(filename + '.h5'))
        pklFLAG = not (os.path.exists(filename + '.pkl'))
        if settingFLAG or (h5FLAG and pklFLAG):
            print("IDS file named " + filename + " not found")
            return

        f = open(settings_file, "r")
        settings = f.readlines()
        for s in settings:
            aux = s.split('=')
            # strip the trailing newline so string comparisons (e.g. Model, Multiclass) work
            aux = aux[1].strip()
            if "Normal=" in s:
                self.normal_class = int(aux)
            if "Names=" in s:
                self.classes_names = aux.split()
            if "Labels=" in s:
                self.classes = []
                aux_labels = aux.split()
                for idx in aux_labels:
                    self.classes.append(int(idx))
            if "Fields=" in s:
                self.fields = aux.split()
            if "Bias=" in s:
                self.norm_bias = float(aux)
            if "Model=" in s:
                self.model_type = aux
            if "Multiclass=" in s:
                self.multiclass = False
                if aux == 'True':
                    self.multiclass = True
        f.close()

        if self.model_type == 'DNN':
            self.model = load_model(filename + ".h5")
        else:
            self.model = joblib.load(filename + ".pkl")

        with open(filename + '_dummies.pkl', 'rb') as f:
            self.dummies = pickle.load(f)

    def analysis(self, filename, anomalousFLAG=False, n=None, norm_bias=0):
        """
        The analysis method analyzes a log file, labelling each sample of the input
        file with its predicted class (the columns having missing values will be
        dropped).

        :param filename: it can be a string representing the name of the file (with
                              its path) which has to be analyzed, or the dataframe
                              itself
        :param anomalousFLAG: it has to be True in order to return the
                              classification of the anomalous samples only (False
                              by default)
        :param n: it is the number of samples which have to be returned, or None in
                              order to return all the samples (None by default)
        :param norm_bias: it is the value subtracted from the normal class score
                              before predicting the classes to which the samples
                              belong; a positive value reduces the missed alarms but
                              increases the false positive rate (0 by default)

        :return: the dataframe including the predicted labels in the outcome column
        """
        if type(filename) is str:
            aux_df = self.dataset_reader(filename)
        else:
            aux_df = filename.copy()
        aux_df.columns = self.fields
        df = aux_df.copy()

        x = self.compute_dataset(aux_df, analysisFLAG=True)
        if self.model_type == 'DNN':
            pred = self.model.predict(x)
        else:
            pred = self.model.predict_proba(x)

        pred = self._apply_bias(pred, norm_bias)
        y_pred = np.argmax(pred, axis=1)

        labels = list()
        names = self.classes_names
        for idx in y_pred:
            labels.append(names[self.classes.index(idx)])

        df['outcome'] = labels

        if anomalousFLAG is True:
            normal_name = self.classes_names[self.classes.index(self.normal_class)]
            df.drop(df[df['outcome'] == normal_name].index, inplace=True)

        if n is None:
            n = len(labels)

        return df.iloc[0:n, 0:]
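Taken together, the class is intended to be driven roughly as follows. A minimal usage sketch, assuming a headerless CSV log whose last column holds the class names and whose non-anomalous class is called 'normal' (the file name and column names are placeholders):

# hypothetical end-to-end use of AnomaLog; file and column names are illustrative
analyzer = AnomaLog(multiclass=True, model_type='DNN')
df = analyzer.dataset_reader('logs.csv', column_names=['duration', 'src_bytes', 'dst_bytes', 'label'])
x, y = analyzer.compute_dataset(df, labels_column='label', normal_class='normal')
x_train, x_test, y_train, y_test = analyzer.split_data(x, y, test_fraction=0.2)
analyzer.fit(x_train, y_train, validation_fraction=0.1, epochs_number=200, pat=5)
analyzer.evaluate_performance(x_test, y_test)
analyzer.save('IDS')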
result = model.score(X_test, y_test)
print("Accuracy on test data by Logistic Regression with sigmoid kernals is:",
      result)
# .....................................................................Cross-validation for Logistic Regression

cv_scores = cross_val_score(LR, all_features2, all_classes, cv=10)
print("cv_Score by Logistic Regression with sigmoid kernals is:",
      cv_scores.mean())

#.............................................................Keras Neural Network
from keras.wrappers.scikit_learn import KerasClassifier
from keras.models import Sequential
from keras.layers import Dense, Activation
# def create_model():
model = Sequential()
model.add(Dense(8, input_dim=4, kernel_initializer='normal',
                activation='relu'))
model.add(Dense(4, kernel_initializer='normal', activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy',
              optimizer='rmsprop',
              metrics=['accuracy'])
# return model
history = model.fit(X_train,
                    y_train,
                    batch_size=100,
                    epochs=30,
                    verbose=2,
                    validation_data=(X_test, y_test))
# ............................................................test data error by Keras

score = model.evaluate(X_test, y_test, verbose=0)
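model.evaluate returns the loss followed by the compiled metrics, so the result above can be reported directly, for example:

# score is [loss, accuracy] because the model was compiled with metrics=['accuracy']
print("Test loss by Keras is:", score[0])
print("Test accuracy by Keras is:", score[1])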
Example #4
# Encode Categorical Variable (y)
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import LabelEncoder   
le = LabelEncoder()
ohe = OneHotEncoder()

Y_train_ec = le.fit_transform(Y_train)
Y_train_ec = ohe.fit_transform(Y_train_ec.reshape(-1, 1)).toarray()

# reuse the encoders fitted on the training labels (transform only, no refitting)
Y_validation_ec = le.transform(Y_validation)
Y_validation_ec = ohe.transform(Y_validation_ec.reshape(-1, 1)).toarray()

# Model : INPUT(4) => FC(10) => RELU => FC(10) => RELU => FC(3) => SOFTMAX
model = Sequential()
model.add(Dense(10, input_dim=4, kernel_initializer="uniform", activation="relu"))
model.add(Dense(10, kernel_initializer="uniform", activation="relu"))
model.add(Dense(3, kernel_initializer="uniform", activation="softmax"))
# a 3-class softmax output is trained with the categorical loss
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X_train, Y_train_ec, epochs=150, batch_size=10)
scores = model.evaluate(X_train, Y_train_ec)
print('Keras accuracy: %.2f' % (scores[1]*100))

model.evaluate(X_train, Y_train_ec)
model.evaluate(X_validation, Y_validation_ec)





classifier = SVC(C=100, kernel='rbf', gamma=0.001, random_state=0)
classifier.fit(X_train, y_train)

#-----------------------------   ANN using Keras   -----------------------------#
# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

# Initialising the ANN
classifier = Sequential()
# Adding the input layer and the first hidden layer
classifier.add(
    Dense(units=11,
          kernel_initializer='uniform',
          activation='relu',
          input_dim=22))
# Adding the second hidden layer
classifier.add(Dense(units=11, kernel_initializer='uniform',
                     activation='relu'))
# Adding the output layer
classifier.add(
    Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))
# Compiling the ANN
classifier.compile(optimizer='nadam',
                   loss='binary_crossentropy',
                   metrics=['accuracy'])
# Fitting the ANN to the Training set
classifier.fit(X_train, y_train, batch_size=100, epochs=2000)
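Because the output layer is a single sigmoid unit trained with binary_crossentropy, hard class labels are obtained by thresholding the predicted probabilities at 0.5. A hedged follow-up, assuming X_test and y_test exist alongside the training split:

import numpy as np

# threshold the sigmoid outputs at 0.5 to get 0/1 class labels (illustrative only)
y_pred = (classifier.predict(X_test) > 0.5).astype(int)
print("Test accuracy:", np.mean(y_pred.ravel() == np.asarray(y_test).ravel()))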
Example #6
lrClassifier = LogisticRegression()
parameters = {'clf__C': [0.001, 0.01, 0.1, 0.25, 0.5, 1, 5, 10], 'clf__max_iter': [100, 200, 300, 400],
              'clf__penalty': ['l2'], 'clf__class_weight': [{'A': 1, 'B': 5, 'C': 10}],
          #    'clf__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']
             }

# 15. Ensemble model - VotingClassifier                     # 0.62
estimators = [('SVM', svClassifier), ('RF', rfClassifier), ('GB', gbClassifier), ('SGD', sgdClassifier), ('AB', abClassifier), ('LR', lrClassifier)]
classifier = VotingClassifier(estimators=estimators, voting='soft')

# 16. Ensemble model - Stacking                             # 0.82
baseLearners = {'SVM': svClassifier, 'RF': rfClassifier, 'GB': gbClassifier, 'SGD': sgdClassifier,'AB': abClassifier, 'LR': lrClassifier}
metaLearner = GradientBoostingClassifier(n_estimators=1000)
classifier = SuperLearner(folds=10, backend="multiprocessing")

classifier.add(list(baseLearners.values()), proba=True)
classifier.add_meta(metaLearner, proba=True)

# 17. XGBoost
classifier = XGBClassifier()
parameters = {'clf__max_depth': [3, 4], 'clf__n_estimators': [100, 200, 500], 'clf__learning_rate': [0.05, 0.1, 0.15]}


#######################################################################
# Use pipeline to apply operations sequentially :-
#######################################################################
text_clf_pipeline = Pipeline([
    ('feature_transforms', FeatureUnion([
        ('title_pipeline', Pipeline([
            ('extract_field', FunctionTransformer(lambda x: x['Title'], validate=False)),
            ('title_tfidf', titleTfIdfVectorizer)
Example #7
    assert False  #stop here

###############################
####Defining the model#########
###############################

from keras.models import Model, Sequential
from keras.layers import Input, Dense, Conv1D, concatenate, Dropout, Flatten, MaxPooling1D, Reshape
from keras.layers import GlobalMaxPooling1D, GlobalAveragePooling1D
from keras.callbacks import ModelCheckpoint, Callback, LearningRateScheduler
from keras.optimizers import Adam, Nadam, SGD, RMSprop, Adagrad, Adadelta
from keras.initializers import RandomUniform

model = Sequential()
if args.nmer == 3:
    model.add(Dense(256, input_shape=(64, ), activation="relu"))
elif args.nmer == 1:
    model.add(Dense(256, input_shape=(4, ), activation="relu"))
elif args.nmer == 2:
    model.add(Dense(256, input_shape=(16, ), activation="relu"))
elif args.nmer == 4:
    model.add(Dense(256, input_shape=(256, ), activation="relu"))
elif args.nmer == 5:
    model.add(Dense(256, input_shape=(1024, ), activation="relu"))
else:
    assert False
model.add(Dense(256, activation="relu"))
model.add(Dense(256, activation="relu"))

model.add(Dense(1, activation="sigmoid", use_bias=True))
classifier = Sequential()
# Adding the input layer and the first hidden layer
#count_ng1: 5037
#count_ng2: 13374
#count_ng3: 21050
##tfidf1: 5037
##tfidf2: 13374
##tfidf3: 21050
#After preprocessing
#count_ng1: 2604
#count_ng2: 10411
#count_ng3: 17791
##tfidf1: 2604
##tfidf2: 10411
##tfidf3: 17791
classifier.add(
    Dense(units=8, kernel_initializer='uniform', activation='relu', input_dim=17791))
## Adding the second hidden layer
#classifier.add(Dense(output_dim = 8, init = 'uniform', activation = 'relu'))
# Adding the output layer
classifier.add(Dense(units=4, kernel_initializer='uniform', activation='softmax'))
# Compiling the ANN
classifier.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])
# Fitting the ANN to the Training set
classifier.fit(X_train.toarray(),
               Y_train,
               batch_size=20,
               epochs=100,
               verbose=1)
####### Classifier Evaluation ###########
print("acc = ", accuracy_score([0,1,1,0], y_predict))

from keras.models import Sequential
from keras.layers import Dense
from keras.callbacks import EarlyStopping
import numpy as np

x_train = np.array([[0,0], [1,0], [0,1], [1,1]])

from keras.utils import np_utils
# y_train = np_utils.to_categorical(y_train)


# DNN
model = Sequential()
model.add(Dense(32, input_shape=(x_train.shape[1],)))
model.add(Dense(16, activation='relu'))
model.add(Dense(8, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['acc'])
early_stopping = EarlyStopping(monitor='loss', patience=10, mode='auto')
model.fit(x_train, y_train, epochs=100, batch_size=1, callbacks=[early_stopping])

x_test = np.array([[0,0], [1,0], [0,1], [1,1]])

y_predict = model.predict(x_test)
# y_predict = np.argmax(y_predict, axis=1)

y_predict = np.where(y_predict > 0.5, 1, 0).reshape(4,)  # key step: threshold the sigmoid outputs at 0.5
final_time = end_time - start_time

results["KNN"] = {"ACC": acc / 5, "Recall": recall / 5, "Time": final_time}

# Evaluate the multi-layer neural network model
kf = KFold(n_splits=5, shuffle=True)
acc = 0
recall = np.zeros(n_categories)
start_time = time.time()
for train_index, test_index in kf.split(x):
    # Training phase
    x_train = x[train_index, :]
    y_train = y[train_index]

    clf = Sequential()
    clf.add(Dense(8, input_dim=n_features, activation='relu'))
    clf.add(Dense(8, activation='relu'))
    if n_categories == 3:
        clf.add(Dense(3, activation='softmax'))
        y_train = np_utils.to_categorical(y_train)
        # a 3-class softmax output needs the categorical loss, not the binary one
        clf.compile(loss='categorical_crossentropy', optimizer='adam')
    elif n_categories == 2:
        clf.add(Dense(1, activation='sigmoid'))
        clf.compile(loss='binary_crossentropy', optimizer='adam')
    clf.fit(x_train, y_train, epochs=150, batch_size=8, verbose=0)

    # Test phase
    x_test = x[test_index, :]
    y_test = y[test_index]
    if n_categories > 2:
        y_pred = np.argmax(clf.predict(x_test), axis=-1)
    elif n_categories == 2:
from keras.layers import Flatten

# In[139]:

from keras.layers import Dense, Conv2D, MaxPooling2D
from keras.models import Sequential

# In[140]:

import PIL

# In[141]:

classifier = Sequential()
# Input layer
classifier.add(Conv2D(32, (3, 3), input_shape=(64, 64, 3), activation='relu'))

# In[142]:

classifier.add(MaxPooling2D(pool_size=(2, 2)))

# In[143]:

# Hidden layer 1
classifier.add(Conv2D(32, (3, 3), activation='relu'))
classifier.add(MaxPooling2D(pool_size=(2, 2)))

# In[144]:

# Hidden layer 2
accuracy = accuracy_score(y_test, y_pred)

results['XGBoost'] = {'training_time': t, 'accuracy': accuracy}
# Neural Networks
EPOCHS, LAYERS, UNITS = 100, 4, 8
ACTIVATIONS = {
    'leaky_relu': LeakyReLU(),
    'relu': Activation('relu'),
    'selu': Activation('selu'),
    'sigmoid': Activation('sigmoid'),
    'tanh': Activation('tanh')
}

for name, activation in ACTIVATIONS.items():
    model = Sequential()
    model.add(Dense(UNITS, input_dim=4))
    for l in range(LAYERS):
        if l != 0:
            model.add(Dense(UNITS))
        model.add(activation)
    model.add(Dense(3))
    model.add(Activation('softmax'))
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])

    def fn():
        model.fit(X_train, y_train_oh, epochs=EPOCHS, verbose=0)

    # time one full training run of this activation's model
    t = timeit.timeit(fn, number=1)

    accuracy = model.evaluate(X_test, y_test_oh, verbose=0)[1]
Example #13
#Y_train = keras.utils.to_categorical(Y_train, 4)
onehotencoder = OneHotEncoder()
Y_train = Y_train.reshape((-1, 1))
Y_train = onehotencoder.fit_transform(Y_train).toarray()
labelencoder_Y_test = LabelEncoder()
Y_test = labelencoder_Y_test.fit_transform(Y_test)
onehotencoder = OneHotEncoder()
Y_test = Y_test.reshape((-1, 1))
Y_test = onehotencoder.fit_transform(Y_test).toarray()

# Initialising the ANN
classifier = Sequential()
# Adding the input layer and the first hidden layer
#tfidf_ng1: 100804 for balanced and  for unbalanced
classifier.add(
    Dense(units=8, kernel_initializer='uniform', activation='relu',
          input_dim=100804))  ##16455 for balanced and 33350 for unbalanced
## Adding the second hidden layer
#classifier.add(Dense(output_dim = 8, init = 'uniform', activation = 'relu'))
# Adding the output layer
classifier.add(Dense(units=5, kernel_initializer='uniform', activation='softmax'))
# Compiling the ANN
classifier.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])
# Fitting the ANN to the Training set
classifier.fit(X_train.toarray(),
               Y_train,
               batch_size=20,
               epochs=100,
               verbose=1)
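A hedged evaluation step for this classifier, assuming X_test and Y_test were vectorized and one-hot encoded with the same preprocessing as the training data:

# evaluate returns [loss, accuracy] since the model was compiled with metrics=['accuracy']
loss, acc = classifier.evaluate(X_test.toarray(), Y_test, verbose=0)
print("Keras test accuracy: %.2f%%" % (acc * 100))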
Example #14
    print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
# Compare Algorithms
plt.boxplot(results, labels=names)
plt.title('Algorithm Comparison')
plt.show()

# Using KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=2)
classifier.fit(X_train, y_train)

# Starting with the ANN
import keras
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout
'''
classifier = Sequential()
#Starting with the first layer and hidden layer
classifier.add(Dense(units = 3,kernel_initializer = 'uniform',activation = 'relu',input_dim = 5))

classifier.add(Dense(units = 3, kernel_initializer = 'uniform',activation = 'relu'))
classifier.add(Dense(units = 3, kernel_initializer = 'uniform',activation = 'relu'))
classifier.add(Dense(units = 3, kernel_initializer = 'uniform',activation = 'relu'))
#classifier.add(Dense(units = 3, kernel_initializer = 'uniform',activation = 'relu'))


classifier.add(Dense(units = 1,kernel_initializer = 'uniform',activation = 'sigmoid'))
#Compiling the NN
classifier.compile(optimizer = 'adam',loss = 'binary_crossentropy', metrics = ['accuracy'])

#Fitting the data to the ANN
Example #15
# Fitting K-NN to the Training set
from sklearn.neighbors import KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
classifier.fit(X, y)

# Importing the Keras libraries and packages
import keras
from keras.models import Sequential
from keras.layers import Dense

# Initialising the ANN
classifier = Sequential()

# Adding the input layer and the first hidden layer
classifier.add(
    Dense(units=7, kernel_initializer='uniform', activation='relu', input_dim=12))

# Adding the second hidden layer
classifier.add(Dense(units=7, kernel_initializer='uniform', activation='relu'))

# Adding the output layer
classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

# Compiling the ANN
classifier.compile(optimizer='adam',
                   loss='binary_crossentropy',
                   metrics=['accuracy'])

# Fitting the ANN to the Training set
classifier.fit(X, y, batch_size=10, epochs=100)
Example #16
classifier = Sequential()
# Adding the input layer and the first hidden layer
#tfidf(1,1)-->3822 for balanced and 9267 for unbalanced
#tfidf(1,2)-->10835 for balanced and 29543 for unbalanced
#tfidf(1,3)--> 18632 for balanced and 53131 for unbalanced
#count(1,1)--> 3822 for balanced and 9267 for unbalanced
#count(1,2)--> 10835 for balanced and 29543 for unbalanced
#count(1,3)--> 18632 for balanced and 53131 for unbalanced
#####after preprocessing
#count(1,1)-->3060  for balanced and 7486 for unbalanced
#count(1,2)--> 8563 for balanced and 23894 for unbalanced
#count(1,3)--> 14410 for balanced and 42258 for unbalanced
#tfidf(1,1)--> 3060 for balanced and 7486 for unbalanced
#tfidf(1,2)--> 8563 for balanced and 23894 for unbalanced
#tfidf(1,3)--> 14410 for balanced and 42258 for unbalanced
classifier.add(
    Dense(units=8, kernel_initializer='uniform', activation='relu', input_dim=42258))
## Adding the second hidden layer
#classifier.add(Dense(output_dim = 8, init = 'uniform', activation = 'relu'))
# Adding the output layer
classifier.add(Dense(units=3, kernel_initializer='uniform', activation='softmax'))
# Compiling the ANN
classifier.compile(optimizer='adam',
                   loss='categorical_crossentropy',
                   metrics=['accuracy'])
# Fitting the ANN to the Training set
classifier.fit(X_train.toarray(),
               Y_train,
               batch_size=20,
               epochs=100,
               verbose=1)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import accuracy_score    
Accuracy=accuracy_score(y_test, y_pred, normalize=True, sample_weight=None)
print(Accuracy*100,'%')

"""Deep Learning"""

X_train, X_test, y_train, y_test = train_test_split(X, y_onehot, test_size = 0.35)

from keras import layers
from keras.layers import Dense
from keras.models import Sequential
from keras.regularizers import l2
classifier = Sequential()
classifier.add(Dense(units = 2048, activation = 'relu',input_shape=(3285,)))
classifier.add(layers.Dropout(0.2))
classifier.add(Dense(units = 1024, activation = 'relu'))
classifier.add(layers.Dropout(0.2))
classifier.add(Dense(units = 512, activation = 'relu'))
#classifier.add(layers.Dropout(0.3))
classifier.add(Dense(units = 256, activation = 'relu'))
classifier.add(Dense(units = 128, activation = 'relu'))
classifier.add(Dense(units = 64, activation = 'relu'))
classifier.add(Dense(units = 36, activation = 'softmax'))
classifier.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])
history = classifier.fit(x=X_train, y=y_train, batch_size=64, epochs=100, verbose=1, validation_data=(X_test, y_test), shuffle=True)
y_pred = classifier.predict(X_test)
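The softmax outputs and the one-hot targets can be collapsed back to class indices before scoring; a minimal sketch, reusing the accuracy_score import from above and assuming numpy is available:

import numpy as np

# convert probabilities and one-hot labels back to class indices, then score
y_pred_classes = np.argmax(y_pred, axis=1)
y_true_classes = np.argmax(y_test, axis=1)
print("Deep learning test accuracy:", accuracy_score(y_true_classes, y_pred_classes))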

"""Training full data"""