Example #1
def dataSetGenKdd99(log):
    # Columns 1-3 (protocol_type, service, flag) are nominal; give each its own
    # dict that assigns integer codes, and leave the other 38 columns numeric.
    kddNormalizeMap = list(range(41))
    kddNormalizeMap[1:4] = {}, {}, {}

    def kddNormalize(kddArr):
        # [0 b'tcp' b'http' b'SF' ...
        result = []
        for i, kddMap, kddEntry in zip(range(len(kddArr)), kddNormalizeMap,
                                       kddArr):
            if i == 0 or i >= 4:
                result.append(float(kddEntry))
                continue
            if kddEntry not in kddMap:
                kddMap[kddEntry] = len(kddMap)
            result.append(float(kddMap[kddEntry]))
        return result

    from sklearn.datasets import fetch_kddcup99
    kddcup99 = fetch_kddcup99()
    # kddcup99.data.shape Out[2]: (494021, 41) 494 021
    log.info(f'Dataset len {kddcup99.data.shape}')
    allData = list()
    for data, target in zip(kddcup99.data, kddcup99.target):
        data = kddNormalize(data)
        data = [float(i) for i in data]
        allData.append((data, target.decode(encoding='utf-8')))
    #
    return allData
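A minimal usage sketch (my addition, assuming only a standard logging.Logger): the function needs an object with an info() method and returns (feature_vector, label) pairs with the three nominal columns integer-encoded.

import logging

logging.basicConfig(level=logging.INFO)
samples = dataSetGenKdd99(logging.getLogger("kdd99"))
features, label = samples[0]
print(len(features), label)  # 41 features and a label such as 'normal.'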
Example #2
    def _tabular_data(self):

        kddcup99_all_data = fetch_kddcup99()
        feature_names = [
            'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
            'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
            'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
            'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
            'num_access_files', 'num_outbound_cmds', 'is_host_login',
            'is_guest_login', 'count', 'srv_count', 'serror_rate',
            'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate',
            'same_srv_rate', 'diff_srv_rate', 'srv_diff_host_rate',
            'dst_host_count', 'dst_host_srv_count', 'dst_host_same_srv_rate',
            'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
            'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
            'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
            'dst_host_srv_rerror_rate'
        ]
        tabular_data_set = pd.DataFrame.from_dict(kddcup99_all_data['data'])
        tabular_data_set.columns = feature_names

        tabular_data_set = tabular_data_set.drop_duplicates(
            subset=feature_names, keep='first')
        target = kddcup99_all_data['target']

        new_tabular_data_set, target = self._sub_sampling(
            tabular_data_set, target)
        list_of_columns = ['protocol_type', 'service', 'flag']
        X = self._set_tabular_df(data_to_work_on=new_tabular_data_set,
                                 list_of_columns=list_of_columns)
        return self._split_train_and_test_tabular_data(
            X, *self._create_value_mapping_for_target_tabular_data(target))
Example #3
    def __init__(self, subset="SF", percent10=False):
        dataset = datasets.fetch_kddcup99(subset=subset, percent10=percent10)
        if subset == "SA":
            columns = sa_columns
            toDecode = toDecodeSA
        else:
            toDecode = toDecodeSF
            columns = sf_columns
        self.df = pd.DataFrame(dataset.data, columns=columns)
        assert len(self.df) > 0, f"{subset} dataset not loaded."
        self.df["target"] = dataset.target
        anomaly_rate = 1.0 - len(
            self.df.loc[self.df["target"] == b'normal.']) / len(self.df)
        print(f"{subset} anomaly rate is {anomaly_rate:.1%}")

        self.df["binary_target"] = [
            1 if x == b'normal.' else -1 for x in self.df["target"]
        ]
        le = preprocessing.LabelEncoder()
        for f in toDecode:
            self.df[f] = list(map(byte_decoder, self.df[f]))
            self.df[f] = le.fit_transform(self.df[f])

        a, b, c, d = train_test_split(self.df.drop(["target", "binary_target"],
                                                   axis=1),
                                      self.df["binary_target"],
                                      test_size=0.33,
                                      random_state=0)
        self.x_train = a
        self.x_test = b
        self.y_train = c
        self.y_test = d

        self.estimators = Estimators()
Example #4
def main():
    np.random.seed(20)

    def scorer(est, x, y):
        y_hat = est.predict(x)
        return classification.accuracy_score(y, y_hat)

    #x, y = make_classification(n_samples=1000, n_classes=4, n_informative=10)
    x, y = fetch_kddcup99(return_X_y=True)
    x = np.array(x[:, 4:], dtype=np.float32)
    y = preprocessing.LabelEncoder().fit_transform(y)
    # X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2)
    myclf = Mree(split_method=greedy_classification)
    score = cross_val_score(myclf, x, y, cv=5, scoring=scorer)
    print("Mine greedy classification result", score, np.mean(score))
    myclf = Mree(split_method=greedy_classification_p_at_k)
    score = cross_val_score(myclf, x, y, cv=5, scoring=scorer)
    print("Mine greedy p@k classification result", score, np.mean(score))

    myclf = Mree(split_method=random_classify_p_at_k)
    score = cross_val_score(myclf, x, y, cv=5, scoring=scorer)
    print("Mine random p@k classification result", score, np.mean(score))

    clf = DecisionTreeClassifier(max_depth=10,
                                 max_features=20,
                                 min_impurity_decrease=0.000001)
    score = cross_val_score(clf, x, y, cv=5, scoring=scorer)
    print("Sklearn greedy classification result", score, np.mean(score))
    clf = ExtraTreeClassifier(max_depth=10,
                              max_features=20,
                              min_impurity_decrease=0.000001)
    score = cross_val_score(clf, x, y, cv=5, scoring=scorer)
    print("sklearn random classification result", score, np.mean(score))
Example #5
def get_dataset(name):
    from sklearn.preprocessing import scale
    data = []
    if name == "cancer":
        from sklearn.datasets import load_breast_cancer
        dataset = load_breast_cancer()
    elif name == "digits":
        from sklearn.datasets import load_digits
        dataset = load_digits()
    elif name == "iris":
        from sklearn.datasets import load_iris
        dataset = load_iris()
    elif name == "boston":
        from sklearn.datasets import load_boston
        dataset = load_boston()
    elif name == "KDD":
        from sklearn.datasets import fetch_kddcup99
        dataset = fetch_kddcup99(subset='SF')
        data = dataset.data[:2000, [0, 2, 3]]
    else:
        print("Unknown name of dataset")
        exit(-1)

    labels = dataset.target
    if len(data) == 0:
        data = scale(dataset.data)
    n_samples, n_features = data.shape
    n_elements = len(set(labels))
    return data, n_elements, labels, len(set(labels))
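Usage sketch (my addition): the "KDD" branch keeps only the first 2000 rows of three numeric SF columns, so it loads quickly once the dataset is cached.

data, n_elements, labels, n_classes = get_dataset("KDD")
print(data.shape, n_classes)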
Example #6
def test_shuffle():
    try:
        dataset = fetch_kddcup99(random_state=0, subset='SA', shuffle=True,
                                 percent10=True, download_if_missing=False)
    except IOError:
        raise SkipTest("kddcup99 dataset can not be loaded.")

    assert(any(dataset.target[-100:] == b'normal.'))
Example #7
def test_shuffle():
    try:
        dataset = fetch_kddcup99(random_state=0, subset='SA', shuffle=True,
                                 percent10=True, download_if_missing=False)
    except IOError:
        raise SkipTest("kddcup99 dataset can not be loaded.")

    assert(any(dataset.target[-100:] == b'normal.'))
Example #8
    def test_Kddcup99Numba():
        from sklearn.datasets import fetch_kddcup99
        kddcup99 = fetch_kddcup99()
        total = len(kddcup99.data)
        online = 442800
        offline = 48791
        # total - online - offline

        print(kddcup99.data[0])
        # Columns 1-3 (protocol_type, service, flag) are nominal; give each its
        # own dict that assigns integer codes, the rest stay numeric.
        kddNormalizeMap = list(range(41))
        kddNormalizeMap[1:4] = {}, {}, {}

        def kddNormalize(kddArr):
            # [0 b'tcp' b'http' b'SF' ...
            result = []
            for i, kddMap, kddEntry in zip(range(len(kddArr)), kddNormalizeMap,
                                           kddArr):
                if i == 0 or i >= 4:
                    result.append(float(kddEntry))
                    continue
                if kddEntry not in kddMap:
                    kddMap[kddEntry] = len(kddMap)
                result.append(float(kddMap[kddEntry]))
            return np.array(result)

        tenPercent = (total // 10)
        baseMapKddcup99 = []
        for data, target in zip(kddcup99.data[:tenPercent],
                                kddcup99.target[:tenPercent]):
            baseMapKddcup99.append({
                'item': kddNormalize(data),
                'label': str(target)
            })
        trainingDF = pd.DataFrame(baseMapKddcup99)
        print(trainingDF.head())
        init = time.time()
        clusters = minasOffline(trainingDF)
        print(
            f'minasOffline(testKddcup99Numba) => {len(clusters)}, {time.time() - init} seconds'
        )
        labels = []
        for cl in clusters:
            if cl.label not in labels:
                print(cl)
                labels.append(cl.label)
        print('\n')

        # online phase: stream the remaining 90% of the data through minasOnline
        allZip = zip(map(kddNormalize, kddcup99.data[tenPercent + 1:]),
                     map(str, kddcup99.target[tenPercent + 1:]))
        inputStream = (Example(item=i, label=t) for i, t in allZip)
        init = time.time()
        for o in metaMinas(
                minasOnline(inputStream, clusters, minDist=minDistNumba)):
            print(o)
        print(
            f'metaMinas(minasOnline(testKddcup99Numba) {time.time() - init} seconds'
        )
Example #9
def load_dataset():
    target = 'target'
    sf = datasets.fetch_kddcup99(subset='SF', percent10=False)
    dfSF = pd.DataFrame(
        sf.data, columns=["duration", "service", "src_bytes", "dst_bytes"])
    assert len(dfSF) > 0, "SF dataset not loaded."

    dfSF[target] = sf.target
    return target, dfSF
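A follow-up step one might add (not part of the original snippet): the SF targets are byte strings, so a binary label can be derived directly from them.

target, dfSF = load_dataset()
dfSF["binary_target"] = (dfSF[target] == b'normal.').astype(int)
print(dfSF["binary_target"].value_counts())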
Example #10
def fetch(dataset='http', fetch_percent_10=True):
    if _Debug:
        pdb.set_trace()
    raw_data = fetch_kddcup99(subset=dataset, percent10=fetch_percent_10)
    unique_data, index = np.unique(raw_data.data.astype(float),
                                   axis=0,
                                   return_index=True)
    scaler_data = StandardScaler().fit_transform(raw_data.data[index])
    scaler_data = MinMaxScaler().fit_transform(scaler_data)
    return scaler_data, raw_data.target[index]
Example #11
def prepare_kddcup():
    x, y = fetch_kddcup99(return_X_y=True)
    y = preprocessing.LabelEncoder().fit_transform(y)
    nominal_cols = [1, 2, 3]
    cv = StratifiedShuffleSplit(n_splits=5, test_size=0.3, random_state=0)
    for n, (train_index, test_index) in enumerate(cv.split(x, y)):
        X_train = x[train_index]
        X_test = x[test_index]
        y_train = y[train_index]
        y_test = y[test_index]
        yield n, X_train, y_train, X_test, y_test, nominal_cols
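Usage sketch (my addition): each yielded fold still carries the raw object-typed feature matrix, so the nominal columns (indices 1-3) need encoding before fitting most estimators.

for fold, X_train, y_train, X_test, y_test, nominal_cols in prepare_kddcup():
    print(fold, X_train.shape, X_test.shape, nominal_cols)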
Example #12
def load_kddcup99():
    X, y = fetch_kddcup99(shuffle=True, return_X_y=True,
                          percent10=False)
    categorical_features = [1, 2, 3]
    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
        ('onehot', OneHotEncoder(handle_unknown='ignore'))])

    preprocessor = ColumnTransformer(
        transformers=[
            ('cat', categorical_transformer, categorical_features)])
    return preprocessor.fit_transform(X), LabelEncoder().fit_transform(y)
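Note that the ColumnTransformer above keeps only the one-hot columns because remainder defaults to 'drop'. A variant that also passes the 38 numeric columns through unchanged could look like this (a sketch reusing categorical_transformer and categorical_features from the function above, not the original author's code):

preprocessor_keep_numeric = ColumnTransformer(
    transformers=[('cat', categorical_transformer, categorical_features)],
    remainder='passthrough')  # keep the numeric columns alongside the one-hot features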
Example #13
def test_percent10():
    try:
        data = fetch_kddcup99(download_if_missing=False)
    except IOError:
        raise SkipTest("kddcup99 dataset can not be loaded.")

    assert data.data.shape == (494021, 41)
    assert data.target.shape == (494021,)

    data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
    assert data.data.shape == data_shuffled.data.shape
    assert data.target.shape == data_shuffled.target.shape

    data = fetch_kddcup99('SA')
    assert data.data.shape == (100655, 41)
    assert data.target.shape == (100655,)

    data = fetch_kddcup99('SF')
    assert data.data.shape == (73237, 4)
    assert data.target.shape == (73237,)

    data = fetch_kddcup99('http')
    assert data.data.shape == (58725, 3)
    assert data.target.shape == (58725,)

    data = fetch_kddcup99('smtp')
    assert data.data.shape == (9571, 3)
    assert data.target.shape == (9571,)

    fetch_func = partial(fetch_kddcup99, 'smtp')
    check_return_X_y(data, fetch_func)
Example #14
def test_percent10():
    try:
        data = fetch_kddcup99(download_if_missing=False)
    except IOError as e:
        if e.errno == errno.ENOENT:
            raise SkipTest("kddcup99 dataset can not be loaded.")

    assert_equal(data.data.shape, (494021, 41))
    assert_equal(data.target.shape, (494021,))

    data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
    assert_equal(data.data.shape, data_shuffled.data.shape)
    assert_equal(data.target.shape, data_shuffled.target.shape)

    data = fetch_kddcup99('SA')
    assert_equal(data.data.shape, (100655, 41))
    assert_equal(data.target.shape, (100655,))

    data = fetch_kddcup99('SF')
    assert_equal(data.data.shape, (73237, 4))
    assert_equal(data.target.shape, (73237,))

    data = fetch_kddcup99('http')
    assert_equal(data.data.shape, (58725, 3))
    assert_equal(data.target.shape, (58725,))

    data = fetch_kddcup99('smtp')
    assert_equal(data.data.shape, (9571, 3))
    assert_equal(data.target.shape, (9571,))
Example #15
def test_percent10():
    try:
        data = fetch_kddcup99(download_if_missing=False)
    except IOError:
        raise SkipTest("kddcup99 dataset can not be loaded.")

    assert_equal(data.data.shape, (494021, 41))
    assert_equal(data.target.shape, (494021,))

    data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
    assert_equal(data.data.shape, data_shuffled.data.shape)
    assert_equal(data.target.shape, data_shuffled.target.shape)

    data = fetch_kddcup99('SA')
    assert_equal(data.data.shape, (100655, 41))
    assert_equal(data.target.shape, (100655,))

    data = fetch_kddcup99('SF')
    assert_equal(data.data.shape, (73237, 4))
    assert_equal(data.target.shape, (73237,))

    data = fetch_kddcup99('http')
    assert_equal(data.data.shape, (58725, 3))
    assert_equal(data.target.shape, (58725,))

    data = fetch_kddcup99('smtp')
    assert_equal(data.data.shape, (9571, 3))
    assert_equal(data.target.shape, (9571,))

    fetch_func = partial(fetch_kddcup99, 'smtp')
    check_return_X_y(data, fetch_func)
Example #16
def test_percent10():
    try:
        data = fetch_kddcup99(download_if_missing=False)
    except IOError:
        raise SkipTest("kddcup99 dataset can not be loaded.")

    assert_equal(data.data.shape, (494021, 41))
    assert_equal(data.target.shape, (494021,))

    data_shuffled = fetch_kddcup99(shuffle=True, random_state=0)
    assert_equal(data.data.shape, data_shuffled.data.shape)
    assert_equal(data.target.shape, data_shuffled.target.shape)

    data = fetch_kddcup99('SA')
    assert_equal(data.data.shape, (100655, 41))
    assert_equal(data.target.shape, (100655,))

    data = fetch_kddcup99('SF')
    assert_equal(data.data.shape, (73237, 4))
    assert_equal(data.target.shape, (73237,))

    data = fetch_kddcup99('http')
    assert_equal(data.data.shape, (58725, 3))
    assert_equal(data.target.shape, (58725,))

    data = fetch_kddcup99('smtp')
    assert_equal(data.data.shape, (9571, 3))
    assert_equal(data.target.shape, (9571,))
Example #17
File: tools.py Project: ghl3/gtrees
def make_kddcup(n_samples):

    features, targets = datasets.fetch_kddcup99(subset='smtp', return_X_y=True)

    features = pd.DataFrame(
        features,
        columns=['feature_{}'.format(i) for i in range(features.shape[1])],
        dtype=np.float32)
    targets = pd.Series(targets, name='target')
    # smtp targets are byte strings such as b'normal.'; treat everything else
    # as the positive class
    targets = targets.map(lambda x: 0.0 if x == b'normal.' else 1.0)

    features = features.sample(n=n_samples)

    return features, targets.loc[features.index]
Example #18
def kddcup(percent10, random_state=1):
    data = fetch_kddcup99(percent10=percent10)

    x = data.data
    y_ori = data.target
    y = np.array([1 if l == b'normal.' else -1 for l in y_ori])
    labelencoder_x_1 = LabelEncoder()
    labelencoder_x_2 = LabelEncoder()
    labelencoder_x_3 = LabelEncoder()
    x[:, 1] = labelencoder_x_1.fit_transform(x[:, 1])
    x[:, 2] = labelencoder_x_2.fit_transform(x[:, 2])
    x[:, 3] = labelencoder_x_3.fit_transform(x[:, 3])
    onehotencoder_1 = OneHotEncoder(categorical_features=[1])
    x = onehotencoder_1.fit_transform(x).toarray()
    onehotencoder_2 = OneHotEncoder(categorical_features=[4])
    x = onehotencoder_2.fit_transform(x).toarray()
    onehotencoder_3 = OneHotEncoder(categorical_features=[70])
    x = onehotencoder_3.fit_transform(x).toarray()

    normal = x[np.where(y == 1)]
    anomalies = x[np.where(y == -1)]
    anomalies = shuffle(anomalies, random_state=1)
    anomalies = anomalies[:int(len(normal)/19)]

    scaler = MinMaxScaler()
    scaler.fit(np.concatenate((normal, anomalies), axis=0))
    normal = scaler.transform(normal)
    anomalies = scaler.transform(anomalies)

    x = np.concatenate((normal, anomalies), axis=0)
    y = np.concatenate(([1] * len(normal), [-1] * len(anomalies)), axis=0)
    x, y = shuffle(x, y, random_state=random_state)

    normal = x[np.where(y == 1)]
    test_normal = normal[int(len(normal) / 2):]
    normal = normal[:int(len(normal) / 2)]

    anomalies = x[np.where(y == -1)]
    test_anomalies = anomalies[int(len(anomalies) / 2):]
    anomalies = anomalies[:int(len(anomalies) / 2)]

    x_train = np.concatenate((normal, anomalies), axis=0)
    y_train = np.concatenate(([1] * len(normal), [-1] * len(anomalies)), axis=0)
    x_train, y_train = shuffle(x_train, y_train, random_state=1)

    x_test = np.concatenate((test_normal, test_anomalies), axis=0)
    y_test = np.concatenate(([1] * len(test_normal), [-1] * len(test_anomalies)), axis=0)
    x_test, y_test = shuffle(x_test, y_test, random_state=1)

    return x_train, y_train, x_test, y_test
Example #19
def load_kddcup99():
    X, y = fetch_kddcup99(shuffle=True, return_X_y=True)
    df_X = pd.DataFrame(X)
    X = pd.get_dummies(df_X,
                       columns=[1, 2, 3],
                       prefix=['protocol_type', "service",
                               "flag"]).values.astype(np.float32)
    max_by_col = np.max(X, axis=0)
    min_by_col = np.min(X, axis=0)
    X = (X - min_by_col) / (max_by_col - min_by_col)
    X = X[:, ~np.any(np.isnan(X), axis=0)]
    label_encoder = preprocessing.LabelEncoder()
    y = label_encoder.fit_transform(y)
    return X, y
Example #20
def load_dataset():
    target = 'target'
    sf = datasets.fetch_kddcup99(subset='SF', percent10=False)
    dfSF = pd.DataFrame(
        sf.data, columns=["duration", "service", "src_bytes", "dst_bytes"])
    assert len(dfSF) > 0, "SF dataset not loaded."

    dfSF[target] = sf.target
    anomaly_rateSF = 1.0 - len(
        dfSF.loc[dfSF[target] == b'normal.']) / len(dfSF)

    # dataset size
    print("kddcup length:", len(dfSF))
    # true anomaly rate
    print("SF Anomaly Rate is:" + "{:.1%}".format(anomaly_rateSF))
    return target, dfSF
Example #21
File: Model.py Project: bjjfcbj/ml
 def __init__(self, batch_size):
     kddcup99 = datasets.fetch_kddcup99()
     self._encoder = {
         'protocal': LabelEncoder(),
         'service':  LabelEncoder(),
         'flag':     LabelEncoder(),
         'label':    LabelEncoder()
     }
     self.batch_size = batch_size
     data_X, data_y = self.__encode_data(kddcup99.data, kddcup99.target)
     self.train_dataset, self.test_dataset = self.__split_data_to_tensor(
         data_X, data_y)
     self.train_dataloader = DataLoader(
         self.train_dataset, self.batch_size, shuffle=True)
     self.test_dataloader = DataLoader(
         self.test_dataset, self.batch_size, shuffle=True)
Example #22
def load_train_test_data(
    small: bool, train_normal_only: bool
) -> Tuple[Tuple[pd.DataFrame, np.ndarray], Tuple[pd.DataFrame, np.ndarray]]:
    X, y = fetch_kddcup99(subset='SA', percent10=small, return_X_y=True)
    columns = [
        "duration", "protocol_type", "service", "flag", "src_bytes",
        "dst_bytes", "land", "wrong_fragment", "urgent", "hot",
        "num_failed_logins", "logged_in", "num_compromised", "root_shell",
        "su_attempted", "num_root", "num_file_creations", "num_shells",
        "num_access_files", "num_outbound_cmds", "is_host_login",
        "is_guest_login", "count", "srv_count", "serror_rate",
        "srv_serror_rate", "rerror_rate", "srv_rerror_rate", "same_srv_rate",
        "diff_srv_rate", "srv_diff_host_rate", "dst_host_count",
        "dst_host_srv_count", "dst_host_same_srv_rate",
        "dst_host_diff_srv_rate", "dst_host_same_src_port_rate",
        "dst_host_srv_diff_host_rate", "dst_host_serror_rate",
        "dst_host_srv_serror_rate", "dst_host_rerror_rate",
        "dst_host_srv_rerror_rate"
    ]
    categorical_columns = ["protocol_type", "flag", "service"]
    features = pd.DataFrame(X, columns=columns)
    target = (y == b'normal.') * 1
    for categorical_column in categorical_columns:
        features[categorical_column] = features[categorical_column].astype(
            'category')
    number_anomalies = np.sum(1 - target)
    number_test_samples = 2 * number_anomalies
    if train_normal_only:
        features_train, features_test = features.iloc[:
                                                      -number_test_samples], features.iloc[
                                                          -number_test_samples:]
        target_train, target_test = target[:-number_test_samples], target[
            -number_test_samples:]
    else:
        test_indices = np.random.choice(a=range(len(features)),
                                        size=number_test_samples,
                                        replace=False)
        features_train, features_test = features.drop(
            test_indices), features.loc[test_indices]
        target_train, target_test = np.delete(
            target, test_indices), target[test_indices]
    return (features_train, target_train), (features_test, target_test)


# features, target= load_train_test_data(small=True, train_normal_only=True)

# print(features.columns)
Example #23
 def test_closs_validation(self):
     trainer = Trainer()
     kf = KFold(n_splits=3)
     self.features, self.labels = fetch_kddcup99(subset="http",
                                                 return_X_y=True)
     self.labels = list(
         map(lambda label: 0 if label == b"normal." else 1, self.labels))
     self.labels = np.array(self.labels)
     for train_index, test_index in kf.split(self.features, self.labels):
         train_data = self.features[train_index]
         test_data = self.features[test_index]
         train_label = self.labels[train_index]
         test_label = self.labels[test_index]
         trainer.train(train_data)
         result = trainer.model.predict(test_data)
         accuracy = accuracy_score(test_label, result)
         print("正解率=", accuracy)
         assert accuracy > 0.8
Example #24
def readKDD99(config):
    info("Getting KDD '99 data.")

    datadir = getDataDir(config)
    outputConfig = config['output']
    compress = outputConfig['compress']
    dataName = setFile(datadir, outputConfig['name'])

    featureConfig = config['feature']
    dlFile = setFile(datadir, featureConfig['dropList'])

    if isFile(dataName) and isFile(dlFile):
        info("Loading previously create data frames")
        pddf = getJoblib(dataName)
    else:
        info("Downloading KDD '99 data", ind=2)
        tmp = datasets.fetch_kddcup99()
        X = tmp['data']
        y = tmp['target']
        y = y.reshape((y.shape[0], 1))
        pddf = DataFrame(append(arr=X, values=y, axis=1))

        tmp = pddf.head(n=1000)
        for column in tmp.columns:
            try:
                tmp[column].mean()
                pddf[column] = to_numeric(pddf[column], errors="coerce")
            except:
                continue

        colFile = setFile(datadir, "names.dat")
        colnames = open(colFile).readlines()
        targets = colnames[0].split(",")
        columns = [x.split(":")[0] for x in colnames[1:]]
        columns.append("TARGET")
        pddf.columns = columns

        info("Saving data to {0}".format(dataName))
        saveJoblib(jlfile=dataName, jldata=pddf, compress=compress)

        info("Saving feature data to {0}".format(dlFile))
        writeDropList(dlFile, pddf, dlData=None)

    return pddf
Example #25
def get_kddcup99_sf():
    X, y = fetch_kddcup99(subset='SF', random_state=42, return_X_y=True)
    lb = LabelBinarizer()
    x1 = lb.fit_transform(X[:, 1].astype(str))
    X = np.c_[X[:, :1], x1, X[:, 2:]]
    # targets are byte strings (e.g. b'normal.'); decode them so the string
    # comparisons below actually match
    y = np.array([label.decode('utf-8') for label in y])
    y_df = pd.DataFrame(y, columns=['class'])
    y_df.loc[y_df['class'] != 'normal.', 'class'] = -1
    y_df.loc[y_df['class'] == 'normal.', 'class'] = 1
    y = y_df.values

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=42)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, 'kddcup99_sf'
Example #26
def get_kddcup99_http():
    # fetch data
    X, y = fetch_kddcup99(subset='http', random_state=42, return_X_y=True)
    # cast features to float; targets are byte strings, decode them for the
    # comparisons below
    X = X.astype(np.float32)
    y = np.array([label.decode('utf-8') for label in y])

    # fix classes
    y_df = pd.DataFrame(y, columns=['class'])
    y_df.loc[y_df['class'] != 'normal.', 'class'] = -1
    y_df.loc[y_df['class'] == 'normal.', 'class'] = 1
    y = y_df.values

    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        random_state=42)

    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    return X_train, X_test, y_train, y_test, 'kddcup99_http'
Example #27
def get_dataset(name):
    from sklearn.preprocessing import scale
    data = []
    if name == "cancer":
        from sklearn.datasets import load_breast_cancer
        dataset = load_breast_cancer()
    elif name == "digits":
        from sklearn.datasets import load_digits
        dataset = load_digits()
    elif name == "iris":
        from sklearn.datasets import load_iris
        dataset = load_iris()
    elif name == "boston":
        from sklearn.datasets import load_boston
        dataset = load_boston()
    elif name == "KDD":
        from sklearn.datasets import fetch_kddcup99
        dataset = fetch_kddcup99(subset='SF')
        data = dataset.data[:2000, [0, 2, 3]]
    elif name == "newsgroup":
        from sklearn.feature_extraction.text import TfidfVectorizer
        from sklearn.datasets import fetch_20newsgroups
        dataset = fetch_20newsgroups(subset='train')
        vectorizer = TfidfVectorizer()
        data = vectorizer.fit_transform(dataset.data)
        #    data = vectors.nnz / float(vectors.shape[0])
        labels = dataset.target
        return data.toarray(), 1, labels, len(set(labels))
    else:
        print("Unknown name of dataset")
        exit(-1)

    labels = dataset.target
    if len(data) == 0:
        data = scale(dataset.data)
    n_samples, n_features = data.shape
    #n_elements = len(unique(labels))
    return data, 1, labels, len(set(labels))
Example #28
    def X_y_dataset(self, remove_duplicates: bool = False, full_dataset: bool = True, force: bool = False) -> Tuple[pd.DataFrame, pd.DataFrame]:
        """
        Helper function to create the dataset, including the dependent "target" variable.

        :param remove_duplicates: Flag to decide whether duplicates should be reduced using Dataframe.drop_duplicates
        :param full_dataset: Flag to decide if the full dataset or only 10% should be retrieved.
        :param force: Flag to force re-retrieval of X and y from the source instead of using the locally stored (X, y) from a previous call.
        :return: The dataset as (X, y).
        """
        # Lazy init
        if self._X is None or self._y is None or force is True:

            logger.info(f"Step  - Only 10% of Dataset: {(not full_dataset)}")
            data, target = fetch_kddcup99(return_X_y=True, percent10=(not full_dataset), random_state=RANDOM_STATE)

            target = np.array(target).reshape(-1, 1)

            self._X = pd.DataFrame(data=data, columns=self.label_manager.X_column_names)
            self._y = pd.DataFrame(data=target, columns=self.label_manager.y_column_name)

            if remove_duplicates:
                self._remove_duplicate_rows()

        return self._X, self._y
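Hypothetical usage (my addition): assuming "manager" is an instance of the class defining X_y_dataset, passing full_dataset=False selects the 10% subset.

X, y = manager.X_y_dataset(remove_duplicates=True, full_dataset=False)
print(X.shape, y.shape)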
Example #29
from sklearn.utils import shuffle as sh

print(__doc__)

np.random.seed(2)

# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['shuttle']

novelty_detection = True  # if False, training set polluted by outliers

for dataset_name in datasets:
    # loading and vectorization
    print('loading data')
    if dataset_name in ['http', 'smtp', 'SA', 'SF']:
        dataset = fetch_kddcup99(subset=dataset_name, shuffle=True,
                                 percent10=False)
        X = dataset.data
        y = dataset.target

    if dataset_name == 'shuttle':
        dataset = fetch_mldata('shuttle')
        X = dataset.data
        y = dataset.target
        X, y = sh(X, y)
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)
Example #30
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.datasets import fetch_kddcup99
from sklearn.model_selection import train_test_split
import numpy as np
import random

random_seed = 42
random.seed(random_seed)
np.random.seed(random_seed)
dataset_name = "kddcup99-http"

#%% Generate classes

print('Generating classes')

x, y = fetch_kddcup99(return_X_y=True, subset='http', percent10=True)
x_train, x_test, y_train, y_test = train_test_split(x,
                                                    y,
                                                    test_size=0.4,
                                                    random_state=random_seed)

#%% Perceptron

#TODO: Insert a Perceptron to show the ideal separation

#%% Define model for classification

print('Building model')
# Do not interrupt the training before end of the epochs, to force an
# overfitting
Example #31
def load_data(scale_data=False,
              transform_data=False,
              random_slice=None,
              random_seed=None,
              dataset='breast_cancer'):

    if random_seed is not None:
        np.random.seed(random_seed)

    if dataset == 'breast_cancer':
        data = datasets.load_breast_cancer()
    elif dataset == 'kdd':
        data = datasets.fetch_kddcup99()
    #data = datasets.fetch_covtype()

    X = data.data

    Y = data.target

    #np.savetxt("/home/btodorov/Desktop/foo.csv", X[np.random.choice(Y.shape[0], 1000, replace=False), :], delimiter=",")

    ten_random_records = np.random.choice(Y.shape[0], 10, replace=False)
    # print(X[ten_random_records, :])
    # print('-----------------------------------------------')
    # print(Y[ten_random_records])
    # print('-----------------------------------------------')
    print('X.shape: ', X.shape)
    print('Y.shape: ', Y.shape)
    print('-----------------------------------------------')
    if random_slice is not None:
        random_indices = np.random.choice(
            Y.shape[0],
            random_slice if random_slice < Y.shape[0] else Y.shape[0],
            replace=False)
        X = X[random_indices, :]
        Y = Y[random_indices]

    if transform_data:
        for i in [1, 2, 3]:
            print(X[0, i])
            le = preprocessing.LabelEncoder()
            le.fit(X[:, i])
            X[:, i] = le.transform(X[:, i])
            print('Min-Max {0}: {1}-{2}'.format(i, np.min(X[:, i]),
                                                np.max(X[:, i])))
        le = preprocessing.LabelEncoder()
        le.fit(Y)
        Y = le.transform(Y)

    print(np.amin(X, axis=0))
    print(np.amax(X, axis=0))
    print(np.var(X, axis=0))
    print('1-----------------------------------------------')
    if scale_data:
        X = preprocessing.scale(X)
        #X = preprocessing.MinMaxScaler().fit_transform(X)
        # for i in range(X.shape[1]):
        #     print('Min-Max {0}: {1}-{2}'.format(i, np.min(X[:, i]), np.max(X[:, i])))

        print(np.amin(X, axis=0))
        print(np.amax(X, axis=0))
        print(np.var(X, axis=0))
        print('2-----------------------------------------------')

    shuffled_indices = np.random.choice(Y.shape[0], Y.shape[0], replace=False)

    X_shuffled = X[shuffled_indices, :]
    Y_shuffled = Y[shuffled_indices]

    return X_shuffled, Y_shuffled
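Usage sketch (my addition): for the 'kdd' variant the nominal columns must be label-encoded first, so transform_data=True is the sensible combination.

X, Y = load_data(scale_data=True, transform_data=True,
                 random_slice=5000, random_seed=0, dataset='kdd')
print(X.shape, Y.shape)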
Example #32
from id3 import Id3Estimator
from sklearn.datasets import fetch_kddcup99
from sklearn.model_selection import train_test_split
from id3 import export_graphviz
import numpy as np

bunch = fetch_kddcup99(subset="SA")

data = bunch.data
data = np.delete(data, np.s_[1:4], axis=1)
target = bunch.target
X_train, X_test, y_train, y_test = train_test_split(data,
                                                    target,
                                                    test_size=.2,
                                                    random_state=17)

estimator = Id3Estimator()
print("->Fitting ID3 classifier")
estimator.fit(X_train, y_train)

print("->Writing dot file")
export_graphviz(estimator.tree_, 'tree.dot')

print("->Calculating predictions")
pred = estimator.predict(X_test)

well_detected = 0
for index, val in enumerate(pred):
    if val == y_test[index]:
        well_detected += 1
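# Final report (my addition; the snippet above stops after counting hits):
print("->Accuracy: {:.3f}".format(well_detected / len(pred)))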
Example #33
def fetch_kdd(
        target: list = ['dos', 'r2l', 'u2r', 'probe'],
        keep_cols: list = [
            'srv_count', 'serror_rate', 'srv_serror_rate', 'rerror_rate',
            'srv_rerror_rate', 'same_srv_rate', 'diff_srv_rate',
            'srv_diff_host_rate', 'dst_host_count', 'dst_host_srv_count',
            'dst_host_same_srv_rate', 'dst_host_diff_srv_rate',
            'dst_host_same_src_port_rate', 'dst_host_srv_diff_host_rate',
            'dst_host_serror_rate', 'dst_host_srv_serror_rate',
            'dst_host_rerror_rate', 'dst_host_srv_rerror_rate'
        ],
        percent10: bool = True,
        return_X_y: bool = False
) -> Union[Bunch, Tuple[np.ndarray, np.ndarray]]:
    """
    KDD Cup '99 dataset. Detect computer network intrusions.

    Parameters
    ----------
    target
        List with attack types to detect.
    keep_cols
        List with columns to keep. Defaults to continuous features.
    percent10
        Bool, whether to only return 10% of the data.
    return_X_y
        Bool, whether to only return the data and target values or a Bunch object.

    Returns
    -------
    Bunch
        Dataset and outlier labels (0 means 'normal' and 1 means 'outlier').
    (data, target)
        Tuple if 'return_X_y' equals True.
    """

    # fetch raw data
    data_raw = fetch_kddcup99(subset=None, data_home=None, percent10=percent10)

    # specify columns
    cols = [
        'duration', 'protocol_type', 'service', 'flag', 'src_bytes',
        'dst_bytes', 'land', 'wrong_fragment', 'urgent', 'hot',
        'num_failed_logins', 'logged_in', 'num_compromised', 'root_shell',
        'su_attempted', 'num_root', 'num_file_creations', 'num_shells',
        'num_access_files', 'num_outbound_cmds', 'is_host_login',
        'is_guest_login', 'count', 'srv_count', 'serror_rate',
        'srv_serror_rate', 'rerror_rate', 'srv_rerror_rate', 'same_srv_rate',
        'diff_srv_rate', 'srv_diff_host_rate', 'dst_host_count',
        'dst_host_srv_count', 'dst_host_same_srv_rate',
        'dst_host_diff_srv_rate', 'dst_host_same_src_port_rate',
        'dst_host_srv_diff_host_rate', 'dst_host_serror_rate',
        'dst_host_srv_serror_rate', 'dst_host_rerror_rate',
        'dst_host_srv_rerror_rate'
    ]

    # create dataframe
    data = pd.DataFrame(data=data_raw['data'], columns=cols)

    # add target to dataframe
    data['attack_type'] = data_raw['target']

    # specify and map attack types
    attack_list = np.unique(data['attack_type'])
    attack_category = [
        'dos', 'u2r', 'r2l', 'r2l', 'r2l', 'probe', 'dos', 'u2r', 'r2l', 'dos',
        'probe', 'normal', 'u2r', 'r2l', 'dos', 'probe', 'u2r', 'probe', 'dos',
        'r2l', 'dos', 'r2l', 'r2l'
    ]

    attack_types = {}
    for i, j in zip(attack_list, attack_category):
        attack_types[i] = j

    data['attack_category'] = 'normal'
    for k, v in attack_types.items():
        data.loc[data['attack_type'] == k, 'attack_category'] = v

    # define target
    data['target'] = 0
    for t in target:
        data.loc[data['attack_category'] == t, 'target'] = 1
    is_outlier = data['target'].values

    # define columns to be dropped
    drop_cols = []
    for col in data.columns.values:
        if col not in keep_cols:
            drop_cols.append(col)

    if drop_cols != []:
        data.drop(columns=drop_cols, inplace=True)

    if return_X_y:
        return data.values, is_outlier

    return Bunch(data=data.values,
                 target=is_outlier,
                 target_names=['normal', 'outlier'],
                 feature_names=keep_cols)
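Usage sketch (my addition): with return_X_y=True the function returns the continuous features listed in keep_cols plus 0/1 outlier labels.

X, y = fetch_kdd(target=['dos', 'probe'], percent10=True, return_X_y=True)
print(X.shape, y.mean())  # fraction of records labelled as outliers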
Example #34
from sklearn.preprocessing import LabelBinarizer

print(__doc__)

random_state = 2  # to control the random selection of anomalies in SA

# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ["http", "smtp", "SA", "SF", "shuttle", "forestcover"]

plt.figure()
for dataset_name in datasets:
    # loading and vectorization
    print("loading data")
    if dataset_name in ["http", "smtp", "SA", "SF"]:
        dataset = fetch_kddcup99(subset=dataset_name,
                                 percent10=True,
                                 random_state=random_state)
        X = dataset.data
        y = dataset.target

    if dataset_name == "shuttle":
        dataset = fetch_openml("shuttle", as_frame=False, parser="pandas")
        X = dataset.data
        y = dataset.target.astype(np.int64)
        # we remove data with label 4
        # normal data are then those of class 1
        s = y != 4
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)
Example #35
    if dat == 'synthetic1':

        L, S = gen_synthetic(500, 0.05, 25)
        X = L + S
        print('Data Rank = %d, Data NNZs = %d' %
              (matrix_rank(L), np.count_nonzero(S)))

    if dat == 'synthetic2':

        L, S = gen_synthetic(1000, 0.05, 25)
        X = L + S
        print('Data Rank = %d, Data NNZs = %d' %
              (matrix_rank(L), np.count_nonzero(S)))

    if dat in ['http', 'smtp', 'SA', 'SF']:
        dataset = fetch_kddcup99(subset=dat, shuffle=True, percent10=True)
        X = dataset.data
        y = dataset.target

    if dat == 'shuttle':
        dataset = fetch_mldata('shuttle')
        X = dataset.data
        y = dataset.target
        X, y = sh(X, y)
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)
from sklearn.datasets import fetch_kddcup99, fetch_covtype, fetch_openml
from sklearn.preprocessing import LabelBinarizer

print(__doc__)

random_state = 2  # to control the random selection of anomalies in SA

# datasets available: ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

plt.figure()
for dataset_name in datasets:
    # loading and vectorization
    print('loading data')
    if dataset_name in ['http', 'smtp', 'SA', 'SF']:
        dataset = fetch_kddcup99(subset=dataset_name, percent10=True,
                                 random_state=random_state)
        X = dataset.data
        y = dataset.target

    if dataset_name == 'shuttle':
        dataset = fetch_openml('shuttle')
        X = dataset.data
        y = dataset.target
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)

    if dataset_name == 'forestcover':
fig_roc, ax_roc = plt.subplots(1, 1, figsize=(8, 5))

# Set this to true for plotting score histograms for each dataset:
with_decision_function_histograms = False

# datasets available = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']
datasets = ['http', 'smtp', 'SA', 'SF', 'shuttle', 'forestcover']

# Loop over all datasets for fitting and scoring the estimator:
for dat in datasets:

    # Loading and vectorizing the data:
    print('====== %s ======' % dat)
    print('--- Fetching data...')
    if dat in ['http', 'smtp', 'SF', 'SA']:
        dataset = fetch_kddcup99(subset=dat, shuffle=True,
                                 percent10=True, random_state=random_state)
        X = dataset.data
        y = dataset.target

    if dat == 'shuttle':
        dataset = fetch_mldata('shuttle')
        X = dataset.data
        y = dataset.target
        X, y = sh(X, y, random_state=random_state)
        # we remove data with label 4
        # normal data are then those of class 1
        s = (y != 4)
        X = X[s, :]
        y = y[s]
        y = (y != 1).astype(int)
        print('----- ')