Example #1
    def _predict_self(self):

        clf = IsolationForest(contamination=self.frac)

        clf.fit(self.num_X)

        return clf.predict(self.num_X)
def test_iforest_subsampled_features():
    # It tests non-regression for #5732 which failed at predict.
    rng = check_random_state(0)
    X_train, X_test, y_train, y_test = train_test_split(boston.data[:50], boston.target[:50], random_state=rng)
    clf = IsolationForest(max_features=0.8)
    clf.fit(X_train, y_train)
    clf.predict(X_test)
def outlier_rejection(X, y):
    """This will be our function used to resample our dataset."""
    model = IsolationForest(max_samples=100,
                            contamination=0.4,
                            random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]
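Since the docstring above says the function is meant to resample a dataset, here is a minimal usage sketch under the assumption that it is plugged into imbalanced-learn's FunctionSampler; the rng and the synthetic data are also assumptions, and the function is repeated so the sketch is self-contained.

# Illustrative sketch (assumed context): use outlier_rejection as an imblearn resampler.
import numpy as np
from imblearn import FunctionSampler
from sklearn.datasets import make_blobs
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)  # the rng referenced inside outlier_rejection

def outlier_rejection(X, y):
    """Remove samples that IsolationForest flags as outliers."""
    model = IsolationForest(max_samples=100, contamination=0.4, random_state=rng)
    model.fit(X)
    y_pred = model.predict(X)
    return X[y_pred == 1], y[y_pred == 1]

X, y = make_blobs(n_samples=200, centers=2, random_state=0)
sampler = FunctionSampler(func=outlier_rejection)
X_res, y_res = sampler.fit_resample(X, y)
print(X.shape, X_res.shape)  # fewer rows remain after outlier rejection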
Example #5
def IsolationForest_calulate(train_data_one, test_data):
    # Use an anomaly detection method
    clf = IsolationForest()
    # Train the anomaly detection model
    clf.fit(train_data_one)
    # Model prediction
    Pre_result = clf.predict(test_data)
    # Compute the fraction of samples predicted as normal
    prob = len([x for x in Pre_result if x == 1])/len(Pre_result)
    return prob
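A minimal usage sketch for the function above; the synthetic arrays are assumptions for illustration only.

# Illustrative only: synthetic data to exercise IsolationForest_calulate.
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(0)
train_data_one = rng.randn(200, 4)                        # assumed "normal" training rows
test_data = np.vstack([rng.randn(50, 4),                  # mostly normal test rows...
                       rng.uniform(5, 8, size=(5, 4))])   # ...plus a few obvious outliers

print(IsolationForest_calulate(train_data_one, test_data))  # fraction of test rows predicted normal (+1)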
Example #6
def test_iforest_works():
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    clf = IsolationForest(random_state=rng)
    clf.fit(X)
    pred = clf.predict(X)

    # assert detect outliers:
    assert_greater(np.min(pred[-2:]), np.max(pred[:-2]))
def test_iforest_works(contamination):
    # toy sample (the last two samples are outliers)
    X = [[-2, -1], [-1, -1], [-1, -2], [1, 1], [1, 2], [2, 1], [6, 3], [-4, 7]]

    # Test IsolationForest
    clf = IsolationForest(random_state=rng, contamination=contamination)
    clf.fit(X)
    decision_func = -clf.decision_function(X)
    pred = clf.predict(X)
    # assert detect outliers:
    assert_greater(np.min(decision_func[-2:]), np.max(decision_func[:-2]))
    assert_array_equal(pred, 6 * [1] + 2 * [-1])
    def isolationForest(self, settings, mname, data):
        '''
        :param settings: -> settings dictionary
        :param mname: -> name of serialized cluster
        :return: -> isolation forest instance
        :example settings: -> {n_estimators:100, max_samples:100, contamination:0.1, bootstrap:False,
                        max_features:1.0, n_jobs:1, random_state:None, verbose:0}
        '''
        # rng = np.random.RandomState(42)
        if settings['random_state'] == 'None':
            settings['random_state'] = None

        if isinstance(settings['bootstrap'], str):
            settings['bootstrap'] = str2Bool(settings['bootstrap'])

        if isinstance(settings['verbose'], str):
            settings['verbose'] = str2Bool(settings['verbose'])

        if settings['max_samples'] != 'auto':
            settings['max_samples'] = int(settings['max_samples'])
        # print(type(settings['max_samples']))
        for k, v in settings.items():
            logger.info('[%s] : [INFO] IsolationForest %s set to %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), k, v)
            print("IsolationForest %s set to %s" % (k, v))
        try:
            clf = IsolationForest(n_estimators=int(settings['n_estimators']), max_samples=settings['max_samples'],
                                  contamination=float(settings['contamination']), bootstrap=settings['bootstrap'],
                                  max_features=float(settings['max_features']), n_jobs=int(settings['n_jobs']),
                                  random_state=settings['random_state'], verbose=settings['verbose'])
        except Exception as inst:
            logger.error('[%s] : [ERROR] Cannot instantiate isolation forest with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            print("Error while instantiating isolation forest with %s and %s" % (type(inst), inst.args))
            sys.exit(1)
        # clf = IsolationForest(max_samples=100, random_state=rng)
        try:
            clf.fit(data)
        except Exception as inst:
            logger.error('[%s] : [ERROR] Cannot fit isolation forest model with %s and %s',
                         datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d %H:%M:%S'), type(inst), inst.args)
            sys.exit(1)
        predict = clf.predict(data)
        print("Anomaly Array:")
        print(predict)
        self.__serializemodel(clf, 'isoforest', mname)
        return clf
Example #9
def test_iforest_warm_start():
    """Test iterative addition of iTrees to an iForest """

    rng = check_random_state(0)
    X = rng.randn(20, 2)

    # fit first 10 trees
    clf = IsolationForest(n_estimators=10, max_samples=20,
                          random_state=rng, warm_start=True)
    clf.fit(X)
    # remember the 1st tree
    tree_1 = clf.estimators_[0]
    # fit another 10 trees
    clf.set_params(n_estimators=20)
    clf.fit(X)
    # expecting 20 fitted trees and no overwritten trees
    assert len(clf.estimators_) == 20
    assert clf.estimators_[0] is tree_1
def outlier_removal(df, col, method, params):
    if method == 'Isolation Forest':
        do_outlier_removal = IsolationForest(**params)
    elif method == 'Local Outlier Factor':
        do_outlier_removal = LocalOutlierFactor(**params)
    else:
        method = None
    do_outlier_removal.fit(np.array(df[col]))
    if method == 'Isolation Forest':
        outlier_scores = do_outlier_removal.decision_function(np.array(df[col]))
        df[('meta', 'Outlier Scores - ' + method + str(params))] = outlier_scores
        is_outlier = do_outlier_removal.predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
    if method == 'Local Outlier Factor':
        is_outlier = do_outlier_removal.fit_predict(np.array(df[col]))
        df[('meta', 'Outliers - ' + method + str(params))] = is_outlier
        df[('meta', 'Outlier Factor - ' + method + str(params))] = do_outlier_removal.negative_outlier_factor_
    return df, do_outlier_removal
Example #11
    def predict(self, X, window=DEFAULT_WINDOW):
        """
        Predict if a particular sample is an outlier or not.

        :param X: the time series to detect anomalies in
        :type X: pandas.Series
        :param window: the length of the window
        :type window: int
        :return: 1 denotes normal, 0 denotes abnormal.
        """
        x_train = list(range(0, 2 * window + 1)) + list(range(0, 2 * window + 1)) + list(range(0, window + 1))
        sample_features = list(zip(x_train, X))  # materialise the iterator so it can be reused for fit and predict
        clf = IsolationForest(n_estimators=self.n_estimators, max_samples=self.max_samples,
                              contamination=self.contamination, max_features=self.max_feature,
                              bootstrap=self.bootstrap, n_jobs=self.n_jobs,
                              random_state=self.random_state, verbose=self.verbose)
        clf.fit(sample_features)
        predict_res = clf.predict(sample_features)
        if predict_res[-1] == -1:
            return 0
        return 1
        y = (y != b'normal.').astype(int)
        print_outlier_ratio(y)

    n_samples, n_features = X.shape
    n_samples_train = n_samples // 2

    X = X.astype(float)
    X_train = X[:n_samples_train, :]
    X_test = X[n_samples_train:, :]
    y_train = y[:n_samples_train]
    y_test = y[n_samples_train:]

    print('--- Fitting the IsolationForest estimator...')
    model = IsolationForest(n_jobs=-1, random_state=random_state)
    tstart = time()
    model.fit(X_train)
    fit_time = time() - tstart
    tstart = time()

    scoring = -model.decision_function(X_test)  # the lower, the more abnormal

    print("--- Preparing the plot elements...")
    if with_decision_function_histograms:
        fig, ax = plt.subplots(3, sharex=True, sharey=True)
        bins = np.linspace(-0.5, 0.5, 200)
        ax[0].hist(scoring, bins, color='black')
        ax[0].set_title('Decision function for %s dataset' % dat)
        ax[1].hist(scoring[y_test == 0], bins, color='b', label='normal data')
        ax[1].legend(loc="lower right")
        ax[2].hist(scoring[y_test == 1], bins, color='r', label='outliers')
        ax[2].legend(loc="lower right")
class RamachandranFeature(Feature):
  '''Analyze the phi/psi torsion distributions of proteins.'''
  
  def __init__(self):
    super().__init__()
    self.clf = None
    self.de = None

  def extract(self, input_path, total_num_threads=1, my_id=0):
    '''Extract phi, psi angles from structures in the input path.'''
    for f in self.list_my_jobs(input_path, total_num_threads, my_id):
      if f.endswith('.pdb'):
        self.extract_from_one_file(os.path.join(input_path, f))

  def extract_from_one_file(self, pdb_file):
    '''Extract phi, psi angles from a pdb_file.'''
    structure = data_loading.structure_from_pdb_file(pdb_file)

    for model in structure:
      for chain in model:
        for residue in chain:
          try:
            feature_dict = {'phi' : geometry.get_phi(chain, residue),
                            'psi' : geometry.get_psi(chain, residue)}
            self.feature_list.append(feature_dict)
          except Exception:
            # skip residues for which phi/psi cannot be computed (e.g. chain termini)
            pass

  def visualize(self, transform_features=True):
    '''Visualize the feature statistics.'''
    phis = [ d['phi'] for d in self.feature_list ]
    psis = [ d['psi'] for d in self.feature_list ]

    # Prepare grid points

    xx, yy = np.meshgrid(np.linspace(-np.pi, np.pi, 200), np.linspace(-np.pi, np.pi, 200))
    
    transformed_data = np.c_[xx.ravel(), yy.ravel()]

    if transform_features:
      transformed_data = self.transform_features(transformed_data)
    
    # Draw the decision boundary from the machine learning classifier
    
    if self.clf:

      Z = self.clf.decision_function(transformed_data)
      Z = Z.reshape(xx.shape)

      Z_pred = self.clf.predict(transformed_data)
      Z_pred = Z_pred.reshape(xx.shape)
      
      #plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='darkred')
      plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), Z.max(), 20), cmap=plt.cm.Blues_r)
      #plt.contourf(xx, yy, Z, levels=np.linspace(0, Z.max()), colors='orange')
      plt.contourf(xx, yy, Z_pred, levels=[0.9, 1.1], colors='orange')

    # Draw the density estimation

    if self.de:
      
      Z = self.de.score_samples(transformed_data)
      Z = Z.reshape(xx.shape)
      plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), Z.max(), 7), cmap=plt.cm.Blues_r)

    # Draw the data

    plt.scatter(phis, psis, c='green', s=5)

    # Plot the support vectors if the classifier is SVM

    if isinstance(self.clf, svm.OneClassSVM):

      if transform_features:  
        s_phis = [ machine_learning.cos_sin_to_angle(v[0], v[1]) for v in self.clf.support_vectors_ ]
        s_psis = [ machine_learning.cos_sin_to_angle(v[2], v[3]) for v in self.clf.support_vectors_ ]
        plt.scatter(s_phis, s_psis, c='red')
      else:
        plt.scatter(self.clf.support_vectors_[:][0], self.clf.support_vectors_[:][1], c='red')

    plt.axis([- np.pi, np.pi, - np.pi, np.pi])
    plt.show()

  def save(self, data_path):
    '''Save the data into a csv file.'''
    data = [ (d['phi'], d['psi']) for d in self.feature_list ]
    df = pd.DataFrame(data=data, columns=['phi', 'psi'])
    
    self.append_to_csv(df, os.path.join(data_path, 'rama_features.csv'))

  def load(self, data_path):
    '''Load data from a csv file.'''
    df = pd.read_csv(os.path.join(data_path, 'rama_features.csv'), header=None)
    
    for index, row in df.iterrows():
      self.feature_list.append({'phi':row[0], 'psi':row[1]})

  def transform_features(self, feature_list):
    '''Transform feature representations. The argument feature_list
    can be a list of dictionaries or a list of lists.
    '''
    if isinstance(feature_list[0], dict):
      return [machine_learning.angle_to_cos_sin(d['phi']) + machine_learning.angle_to_cos_sin(d['psi'])
              for d in feature_list]

    else:
      return [machine_learning.angle_to_cos_sin(d[0]) + machine_learning.angle_to_cos_sin(d[1])
              for d in feature_list]

  def learn(self, clf_type="OneClassSVM", transform_features=True):
    '''Learn the distribution with a machine learning classifier'''
    # Prepare the training data
  
    all_data = [(d['phi'], d['psi']) for d in self.feature_list]
    if transform_features:
      all_data = self.transform_features(all_data)
    n_data = len(all_data)

    training_data = all_data[0:int(0.6 * n_data)]
    test_data = all_data[int(0.6 * n_data):int(0.8 * n_data)]
    cv_data = all_data[int(0.8 * n_data):n_data]

    # Train the classifier 

    if clf_type == "OneClassSVM":
      nus = [0.05, 0.02, 0.01, 0.005, 0.002, 0.001]
      least_error = len(test_data)

      for i in range(len(nus)):
        print("nu = {0}".format(nus[i]))

        clf = svm.OneClassSVM(nu=nus[i], kernel="rbf", gamma='auto')
        clf.fit(training_data)
        
        predictions = clf.predict(training_data)
        print("{0}/{1} training error.".format(len(predictions[-1 == predictions]), len(training_data)))
        
        predictions = clf.predict(test_data)
        print("{0}/{1} test error.\n".format(len(predictions[-1 == predictions]), len(test_data)))

        if len(predictions[-1 == predictions]) < least_error:
          least_error = len(predictions[-1 == predictions])
          self.clf = clf
    
    elif clf_type == "IsolationForest": 
      self.clf = IsolationForest(max_samples=20000,
			contamination=0.01, random_state=np.random.RandomState(42))
      self.clf.fit(training_data)
   
    # Print Training results
    
    predictions = self.clf.predict(cv_data)
    print("{0}/{1} cross validation error.".format(len(predictions[-1 == predictions]), len(cv_data)))

    if clf_type == "OneClassSVM":
      print("{0} support vectors found.".format(len(self.clf.support_)))

  def predict(self, input_data, transform_features=True):
    '''Make a prediction for the input data with the machine learning classifier.
    input_data is a list of phi, psi angles.
    '''
    transformed_data = input_data

    if transform_features: 
        transformed_data = self.transform_features(transformed_data)
    
    return self.clf.predict(transformed_data)

  def calculate_space_reduction(self, transform_features=True):
    '''Calculate the space reduction power of the machine learning model.'''
    phis = np.random.uniform(-np.pi, np.pi, 10000)
    psis = np.random.uniform(-np.pi, np.pi, 10000)
    
    predictions = self.predict(list(zip(phis, psis)), transform_features=transform_features)
    print("The space is reduced by {0}.".format(len(predictions[1 == predictions]) / len(predictions)))

  def density_estimate(self, de_type="GaussianMixture", transform_features=True):
    '''Get a density estimation of the data.'''
    all_data = [(d['phi'], d['psi']) for d in self.feature_list]
    if transform_features:
      all_data = self.transform_features(all_data)
    n_data = len(all_data)
    training_data = all_data[0:int(0.7 * n_data)]
    test_data = all_data[int(0.7 * n_data):n_data]

    # Make some random data
    
    phis = np.random.uniform(-np.pi, np.pi, 10000)
    psis = np.random.uniform(-np.pi, np.pi, 10000)
    
    random_data = list(zip(phis, psis))
    if transform_features: 
        random_data = self.transform_features(random_data)

    if de_type == "GaussianMixture":
      self.de = mixture.BayesianGaussianMixture(n_components=100, covariance_type='full').fit(training_data)
      
      # Evaluate the cumulative distribution function of the test data scores

      test_scores = self.de.score_samples(test_data)
      values, base = np.histogram(test_scores, bins=40)
      cumulative = np.cumsum(values)

      for i in range(40):
        
        # Evaluate the space compression

        random_scores = self.de.score_samples(random_data)
        compress_coe = len(random_scores[random_scores > base[i]]) / len(random_scores)
          
        print('{0:.3f}\t{1}\t{2:.5f}\t{3:.5f}'.format(base[i], cumulative[i], cumulative[i] / len(test_data), compress_coe))


    elif de_type == "KernelDensity":
      params = {'bandwidth': np.logspace(-1, 1, 5)}
      grid = GridSearchCV(KernelDensity(), params)
      grid.fit(training_data) 
      self.de = grid.best_estimator_
    filtered_data = df.loc[keys_filter]
    filtered_data = filtered_data.sort_values(by=['Months Since Start'],
                                              ascending=False)

    if len(filtered_data['Full Work Day']) > 12:

        #       sorted_data = filtered_data.sort_values('Full Work Day')
        X_full = filtered_data[['Months Since Start']].values
        y_full = filtered_data['Full Work Day'].values

        # isolation forest to detect anomalies

        y_full_reshape = y_full.reshape(-1, 1)

        clf = IsolationForest(random_state=rng)
        clf.fit(y_full_reshape)
        clf_predicted_amount = clf.predict(y_full_reshape)

        anomaly_test_false = clf_predicted_amount == 1
        anomaly_test_true = clf_predicted_amount == -1

        X = X_full[anomaly_test_false]
        y = y_full[anomaly_test_false]

        X_upto_last_6months = X[0:-6]
        y_upto_last_6months = y[0:-6]
        X_last_6months = X[-6:]
        y_last_6months = y[-6:]

        # run regression
Example #15
# Detects unusual sequences but not extreme values. It is more difficult to evaluate the relevance on this example. The sequence size (5) should match some interesting cycle.
# ## 2.5 Isolation Forest
# #### Use for collective anomalies (unordered).
# Simple, works well with different data distributions and is efficient with high-dimensional data.

# In[ ]:

# Take the useful features and standardize them
data = df[['value', 'hours', 'daylight', 'DayOfTheWeek', 'WeekDay']]
scaler = preprocessing.StandardScaler()
np_scaled = scaler.fit_transform(data)
data = pd.DataFrame(np_scaled)
# train isolation forest
model = IsolationForest(contamination=outliers_fraction)
model.fit(data)
# add the data to the main
df['anomaly25'] = pd.Series(model.predict(data))
df['anomaly25'] = df['anomaly25'].map({1: 0, -1: 1})
print(df['anomaly25'].value_counts())

# In[ ]:

# visualisation of anomaly throughout time (viz 1)
fig, ax = plt.subplots()

a = df.loc[df['anomaly25'] == 1, ['time_epoch', 'value']]  #anomaly

ax.plot(df['time_epoch'], df['value'], color='blue')
ax.scatter(a['time_epoch'], a['value'], color='red')
plt.show()
Example #16
    #########################################################################

    index += 1

print(x_train.shape)
X_train = x_train

rng = np.random.RandomState(42)
isofortrain = IsolationForest(n_estimators=1000,
                              max_samples='auto',
                              contamination=.20,
                              max_features=1,
                              random_state=rng,
                              n_jobs=-1)

isofortrain.fit(X_train)
anomalytrain = isofortrain.decision_function(X_train)
predicttrain = isofortrain.predict(X_train)

len_predictrain = len(predicttrain)
print("len_predictrain", len_predictrain)

num_iforest_diff = 0

for i in predicttrain:
    if i == -1:
        num_iforest_diff += 1

print("num_iforest_diff", num_iforest_diff)

same = 0
def InputOutlierDetection(xtrain,
                          xtest,
                          ytrain,
                          ytest,
                          outlier_percent=0.2,
                          removal=None,
                          isoforest=None,
                          randstate=None,
                          onlytrain=False,
                          n_estimators=100):
    from sklearn.ensemble import IsolationForest
    print('\nExecuting [InputOutlierDetection] using Isolation Forest...')
    # If no existing Isolation Forest is supplied, train one on the current data
    if isoforest is None:
        isoforest = IsolationForest(n_jobs=-1,
                                    verbose=2,
                                    contamination=outlier_percent,
                                    random_state=randstate,
                                    n_estimators=n_estimators,
                                    bootstrap=True)
        # Train the isolation forest to define and detect outliers
        isoforest.fit(xtrain)
        # If only training was requested, return the Isolation Forest and end the function
        if onlytrain:
            return isoforest

    # Yield score arrays on the training and test data in which -1 means anomaly
    xtrain_anomalyscore = isoforest.predict(xtrain)
    # If testSize = 0., then skip test data prediction
    try:
        xtest_anomalyscore = isoforest.predict(xtest)
    except:
        xtest_anomalyscore = []

    # meanScoreTrain = isoforest.decision_function(xtrain)
    # meanScoreTest = isoforest.decision_function(xtest)
    # print('Train data anomaly score (higher is better): {0}'.format(meanScoreTrain))
    # print('Test data anomaly score (higher is better): {0}'.format(meanScoreTest))

    # Get the index array of all data considered abnormal (-1)
    anomaly_idx_train = np.where(xtrain_anomalyscore == -1)
    anomaly_idx_test = np.where(xtest_anomalyscore == -1)
    anomaly_idx_train = anomaly_idx_train[0]
    anomaly_idx_test = anomaly_idx_test[0]

    # Initialize "empty" array/list for outliers
    xtrainOutliers = np.empty([1, xtrain.shape[1]])
    # ytrainOutliers = np.empty(len(anomaly_idx_train))
    ytrainOutliers = np.empty([1, ytrain.shape[1]])
    # If xtest is an empty list then skip this step
    try:
        xtestOutliers = np.empty([1, xtest.shape[1]])
        ytestOutliers = np.empty([1, ytest.shape[1]])
    except AttributeError:
        xtestOutliers = ['dummy']
        ytestOutliers = ['dummy']

    xtrainRaw = xtrain
    ytrainRaw = ytrain
    # Remove the anomaly indices iteratively
    for i, idx in enumerate(anomaly_idx_train):
        xtrainOutliers = np.vstack((xtrainOutliers, xtrainRaw[idx]))
        # ytrainOutliers[i] = ytrain[idx]
        ytrainOutliers = np.vstack((ytrainOutliers, ytrainRaw[idx]))
        # If removal is 'train' or 'both', remove the outliers in the train input and target data
        if removal in ('train', 'both'):
            ytrain = np.delete(ytrain, idx - i, 0)
            # 0 means the first axis -- row
            xtrain = np.delete(xtrain, idx - i, 0)

    # # Update xtrain and ytrain if removal was done
    # if removal in('train', 'both'):
    #     xtrain = xtrainInliers
    #     ytrain = ytrainInliers

    xtestRaw = xtest
    ytestRaw = ytest
    # When anomaly_idx_test is empty, this loop will not execute
    for i, idx in enumerate(anomaly_idx_test):
        xtestOutliers = np.vstack((xtestOutliers, xtestRaw[idx]))
        # ytestOutliers[i] = ytest[idx]
        ytestOutliers = np.vstack((ytestOutliers, ytestRaw[idx]))
        # If removal is 'test' or 'both', then remove the outliers in test input and target data
        if removal in ('test', 'both'):
            ytest = np.delete(ytest, idx - i, 0)
            xtest = np.delete(xtest, idx - i, 0)

    # The arrays were initialized with a dummy first row, so remove it (the first 0 is the index, the second 0 is the row axis)
    xtrainOutliers = np.delete(xtrainOutliers, 0, 0)
    ytrainOutliers = np.delete(ytrainOutliers, 0, 0)

    # xtestOutliers = xtestOutliers[1:]
    xtestOutliers = np.delete(xtestOutliers, 0, 0)
    ytestOutliers = np.delete(ytestOutliers, 0, 0)

    # Get the outliers for later inspection
    outliers = dict(xtrain=xtrainOutliers,
                    xtest=xtestOutliers,
                    ytrain=ytrainOutliers,
                    ytest=ytestOutliers,
                    anomaly_idx_train=anomaly_idx_train,
                    anomaly_idx_test=anomaly_idx_test,
                    xtrain_anomalyscore=xtrain_anomalyscore,
                    xtest_anomalyscore=xtest_anomalyscore)

    return xtrain, xtest, ytrain, ytest, outliers, isoforest
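A minimal call sketch for the function above; the arrays and parameter values below are assumptions for illustration, and ytrain/ytest are kept 2-D because the function indexes ytrain.shape[1].

# Illustrative call (assumed data shapes, not from the original project).
import numpy as np

rng = np.random.RandomState(0)
xtrain, ytrain = rng.randn(100, 3), rng.randn(100, 1)   # 2-D targets, as the function expects
xtest, ytest = rng.randn(30, 3), rng.randn(30, 1)

xtrain_c, xtest_c, ytrain_c, ytest_c, outliers, isoforest = InputOutlierDetection(
    xtrain, xtest, ytrain, ytest,
    outlier_percent=0.1, removal='train', randstate=0, n_estimators=50)

print(xtrain.shape, xtrain_c.shape)        # rows flagged as outliers were removed from the training set
print(len(outliers['anomaly_idx_train']))  # how many training rows were flagged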
Example #18
def main():
    start = time.clock()
    URLKeyword, URLchar, action, title = get_dic()

    #load training dataset
    mainfile = './data/file_list_20170430_new的副本.txt'
    WebDirectory = './data/file的副本/'
    MD5_list, flag_list, URL_list = traverse_directory(WebDirectory, mainfile)
    X_train = list()
    Y_train = flag_list
    for i in range(len(MD5_list)):
        URL = URL_list[i]
        Web_data = read_file(MD5_list[i])
        web_vec = Web_feature(Web_data, title, action, MD5_list[i])
        URL_vec = URL_feature(URL, URLKeyword, URLchar)
        feature = np.hstack((web_vec, URL_vec))
        X_train.append(feature)
        # print(len(feature))
    print(len(X_train), len(Y_train))

    X_train = np.asarray(X_train)
    Y_train = np.asarray(Y_train)
    print(X_train.shape, Y_train.shape)

    #feature selection
    # for a_fea in range(70,60,-2):

    X_train, Y_train, F_index = feature_selection(X_train, Y_train, 70)
    # print(F_index)

    #train model
    # tuned_parameters = {'n_estimators': range(10, 120, 10), "max_samples": range(70, 270, 20),'contamination'
    #                 }
    clf = IsolationForest(contamination=0.06,
                          n_estimators=90,
                          max_samples=150,
                          bootstrap=True)
    clf.fit(X_train, Y_train)
    # print("best parameter:", clf.best_params_)
    # print(clf.grid_scores_)
    # joblib.dump(clf,'Isolation_model.m')
    middle = time.clock()
    print(middle - start)
    y_pred = clf.predict(X_train)
    print('Accuracy Score(normalize=True):',
          accuracy_score(Y_train, y_pred, normalize=True))
    evaluate_model(Y_train, y_pred)
    end = time.clock()
    print(end - middle)
    # print("Testing Score:%f"%clf.score(X_test,y_test))

    #load testing dataset
    mainfile1 = './data/file_list_10000.txt'
    WebDirectory1 = './data/file1/'
    MD5_list1, flag_list1, URL_list1 = traverse_directory_t(
        WebDirectory1, mainfile1)
    X_test = list()
    Y_test = flag_list1
    for h in range(len(MD5_list1)):
        s_fea = []
        URL1 = URL_list1[h]
        Web_data1 = read_file(MD5_list1[h])
        web_vec1 = Web_feature(Web_data1, title, action, MD5_list1[h])
        URL_vec1 = URL_feature(URL1, URLKeyword, URLchar)
        feature1 = np.hstack((web_vec1, URL_vec1))
        for j in F_index:
            s_fea.append(feature1[j])
        X_test.append(s_fea)
        # print("********")
    print(len(X_test), len(Y_test))

    #test model
    y_tpred = clf.predict(X_test)
    print('Accuracy Score(normalize=True):',
          accuracy_score(Y_test, y_tpred, normalize=True))
    evaluate_model(Y_test, y_tpred)
    end2 = time.clock()
    print(end2 - end)
Example #19
            # X = X[indices]
            # y = y[indices]

            X_train = X[:n_samples_train, :]
            X_test = X[n_samples_train:, :]
            y_train = y[:n_samples_train]
            y_test = y[n_samples_train:]

            # # training only on normal data:
            # X_train = X_train[y_train == 0]
            # y_train = y_train[y_train == 0]

            print('IsolationForest processing...')
            model = IsolationForest()
            tstart = time()
            model.fit(X_train)
            fit_time += time() - tstart
            tstart = time()

            scoring = -model.decision_function(X_test)  # the lower,the more normal
            predict_time += time() - tstart
            fpr_, tpr_, thresholds_ = roc_curve(y_test, scoring)

            if predict_time + fit_time > max_time:
                raise TimeoutError

            f = interp1d(fpr_, tpr_)
            tpr += f(x_axis)
            tpr[0] = 0.

            precision_, recall_ = precision_recall_curve(y_test, scoring)[:2]
def Eval(clargs):    
    __version__ = '1.0'
    usage = """train_flows [options] normaldatafile"""
    parser = OptionParser(usage=usage, version=__version__)

    parser.add_option("-x", "--vectorizerfile", action="store", type="string", \
                      default='/tmp/vectorizers.pkl', help="")
    parser.add_option("-v", "--verbose", action="store_true", default=False, \
                      help="enable verbose output")
    parser.add_option("-o", "--maliciousdatafile", action="store", type="string", \
                      default=None, help="An optional file of malicious http logs")
    parser.add_option("-m", "--maxfeaturesperbag", action="store", type="int", \
                      default=100, help="maximum number of features per bag")
    parser.add_option("-g", "--ngramsize", action="store", type="int", \
                      default=7, help="ngram size")

    parser.add_option("-f", "--features", action="store", type="string", \
                      default="01000100111111111111", help="An optional file for choosing which features to be extracted")
    parser.add_option("-t", "--maxtrainingfeatures", action="store", type="int", \
                      default=50000, help="maximum number of rows to train with per class")
    parser.add_option("-n", "--numtrees", action="store", type="int", \
                      default=200, help="number of trees in isolation forest")
    parser.add_option("-s", "--numsamples", action="store", type="int", \
                      default=8192, help="number of samples in each tree")


    Start=time.time()
    (opts, args) = parser.parse_args(clargs)

    if len(args) != 2:
        parser.error('Incorrect number of arguments')

    ftu=[]
    features = opts.features

    for i, j in enumerate(features):
      if opts.verbose: print(j, all_fields[i])
      if j == 1 or j=='1':
        ftu.append(all_fields[i])

    if opts.verbose: print(ftu)
    #ftu = ['method', 'user_agent', 'status_code']


    # load the http data in to a data frame
    print('Loading HTTP data')
    df = load_brofile(args[0], fields_to_use)
    trainDf = load_brofile(args[1], fields_to_use)


    total_rows = len(df.index)
    if opts.verbose: print('Total number of rows: %d' % total_rows)
    if opts.maliciousdatafile != None:
      print('Reading malicious training data')
      df1 = load_brofile(opts.maliciousdatafile, fields_to_use)
      if opts.verbose: print('Read malicious data with %s rows ' % len(df1.index))
      #if (len(df1.index) > opts.maxtrainingfeatures):
      #  if opts.verbose: print('Too many malicious samples for training, downsampling to %d' % opts.maxtrainingfeatures)
      #  df1 = df1.sample(n=opts.maxtrainingfeatures)

      #set the classes of the dataframes and then stitch them together in to one big dataframe
      df['class'] = 0
      df1['class'] = 1
      classedDf = pd.concat([df,df1], ignore_index=True)
    else:
      #we weren't passed a file containing class-1 data, so we should generate some of our own.
      noiseDf = create_noise_contrast(df, numSamples)
      if opts.verbose: print('Added %s rows of generated malicious data'%numSamples)
      df['class'] = 0
      noiseDf['class'] = 1
      classedDf = pd.concat([df,noiseDf], ignore_index=True)

    #that doesn't matter
    trainDf['class']=0;


    #splitting into training and evaluation sets
    classedDf['is_train']=False
    trainDf['is_train']=True

    enhancedDf = enhance_flow(pd.concat([trainDf,classedDf], ignore_index=True), ftu)
    # construct some vectorizers based on the data in the DF. We need to vectorize future log files the exact same way so we
    # will be saving these vectorizers to a file.

    vectorizers = build_vectorizers(enhancedDf, ftu, max_features=opts.maxfeaturesperbag, ngram_size=opts.ngramsize, verbose=opts.verbose)

    #use the vectorizers to featureize our DF into a numeric feature dataframe
    featureMatrix = featureize(enhancedDf, ftu, vectorizers, verbose=opts.verbose)

    #add the class column back in (it wasn't featurized by itself)
    featureMatrix['class'] = enhancedDf['class']
    featureMatrix['is_train'] = enhancedDf['is_train']


    #split out the train and test df's into separate objects
    train, test = featureMatrix[featureMatrix['is_train']==True], featureMatrix[featureMatrix['is_train']==False]

    #drop the is_train column, we don't need it anymore
    train = train.drop('is_train', axis=1)
    test = test.drop('is_train', axis=1)


    #print('Calculating features')


    Trees=opts.numtrees
    Samples=opts.numsamples
    clf = IsolationForest(n_estimators=Trees, max_samples=Samples)

    
    clf.fit(train.drop('class', axis=1))

    testnoclass = test.drop('class', axis=1)

    print('Predicting')

    test.is_copy = False

    test['prediction'] = clf.decision_function(testnoclass) + 0.5

    print('Analyzing')
    #get the class-1 (outlier/anomaly) rows from the feature matrix, and drop the prediction so we can investigate them

    ##From Here
    Left=0.001 
    Right=0.01
    
    fpr, tpr, thresholds = roc_curve(test['class'], test['prediction'], pos_label=0)
    
    F=interpolate.interp1d(fpr, tpr, assume_sorted=True)
    x=np.logspace(np.log10(Left), np.log10(Right))
    y=F(x)
    roc_auc=auc(x, y)

    plt.figure()
    plt.xscale('log')

    plt.plot(fpr, tpr, color='b')
    plt.plot(x,y, color='r')
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver operating characteristic')


    plt.plot(plt.xlim(), plt.ylim(), ls="--", c=".3")
    plt.savefig("fig3.png")
    plt.clf()
    plt.close('all')


    print('Area Under the Curve = %.6f' %(roc_auc))



    Min, Sec= divmod( int(time.time() - Start), 60 )
    #print Min, Sec

    target= open('Results.txt', 'a')
    target.write(str(Trees)+' ')
    target.write(str(Samples)+' ')
    target.write(str(Min)+' ')
    target.write(str(Sec)+' ')
    target.write(str(roc_auc))
    target.write("\n")
    target.write(str(features))
    target.write("\n")
    target.write("\n")
    target.close()

    
    print("Minutes: %d, Seconds: %d" % (int(Min), int(Sec)) )
    return roc_auc 
print("Training: One Class SVM (Linear) : ",(Train_Accuracy(train_AD_L)),"%")
print("Test: One Class SVM (Linear) : ",(Test_Accuracy(test_AD_L)),"%")


# # Isolation Forest

# In[58]:

from sklearn.ensemble import IsolationForest


# In[59]:

IFA=IsolationForest()
IFA.fit(Negatives)


# In[60]:

train_IFA=IFA.predict(Negatives)
test_IFA=IFA.predict(Positives)


# In[61]:

print("Training: Isolation Forest: ",(Train_Accuracy(train_IFA)),"%")
print("Test: Isolation Forest: ",(Test_Accuracy(test_IFA)),"%")


# Isolation Forest has worked much better than the one-class SVM, so it is considered the best anomaly detection model here.
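
# Train_Accuracy and Test_Accuracy are defined elsewhere in that notebook; a plausible
# sketch of what they might compute (an assumption, not the original code) is the fraction
# of negatives predicted as inliers (+1) and of positives predicted as outliers (-1):

# In[ ]:

import numpy as np

def Train_Accuracy(pred):
    # percentage of (normal) training samples predicted as inliers (+1)
    return 100.0 * np.mean(np.asarray(pred) == 1)

def Test_Accuracy(pred):
    # percentage of (anomalous) test samples predicted as outliers (-1)
    return 100.0 * np.mean(np.asarray(pred) == -1)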
Example #22
cur_dir=os.getcwd()
input_path=os.path.join(cur_dir,args["dataset"])         #whatever test_directory is named
frame=frame_from_dir(input_path)
data=[]

model = DenseNet201(weights='imagenet', include_top=False)
for xy in range(len(frame)):   #frame by frame passed through feature extractor to extract feature
    img_path = os.path.join(input_path,frame[xy])
    img = image.load_img(img_path, target_size=(530, 700))
    img_data = image.img_to_array(img)
    img_data = np.expand_dims(img_data, axis=0)
    img_data = preprocess_input(img_data)    
    vgg_feature = model.predict(img_data)
    data.append(vgg_feature)

a=np.array(data)
a=a.reshape(len(frame),-1)   #converting to a feature vector

# train the anomaly detection model
print("[INFO] fitting anomaly detection model...")

modelanom = IsolationForest(n_estimators=150, contamination=0.01,
	random_state=42)

modelanom.fit(a)

# serialize the anomaly detection model to disk
f = open(args["model"], "wb")
f.write(pickle.dumps(modelanom))
f.close()
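For completeness, a minimal companion sketch (the filename and the random stand-in features are assumptions) showing how the pickled detector could be reloaded and used to flag anomalous frames:

# Illustrative: reload the pickled IsolationForest and score new feature vectors.
import pickle
import numpy as np

# Assumed filename; the original script writes to args["model"].
with open("anomaly_detector.pickle", "rb") as f:
    detector = pickle.load(f)

# Stand-in features; real inputs must have the same length as the flattened
# DenseNet201 features used at fit time (n_features_in_ requires a recent scikit-learn).
new_features = np.random.randn(10, detector.n_features_in_)
labels = detector.predict(new_features)   # +1 = normal frame, -1 = anomalous frame
print(np.where(labels == -1)[0])          # indices of frames flagged as anomalous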
Example #23
target = lat.flatten()

for i in range(len(target)):
    if (target[i] == 1):
        target[i] = -1
    elif (target[i] == 0):
        target[i] = 1
    print(target[i])

scaler.fit(lon)
normalised_input_data = scaler.transform(lon)

print(type(normalised_input_data))

clf = IsolationForest(max_samples=100, random_state=42, contamination=.35)
clf.fit(normalised_input_data)
y_pred = clf.predict(normalised_input_data)
accu = 0.000
print(list(y_pred).count(-1))
print(len(y_pred))
no_outliers = list(y_pred).count(-1)
l = len(y_pred)
accu = no_outliers / l
print(accu)
print("Accuracy in Detecting Fraud Cases:", accu)

print(y_pred)
print(target)

plt.subplot(2, 1, 1)
plt.scatter(normalised_input_data[:, 0],
Example #24
def anomaly_detection(testdata_name,rank_method_index,test_EVs_ts,test_MVs_ts):
    # Local Outlier Factor
    from sklearn.neighbors import LocalOutlierFactor
    from myFunctions import gen_dist_mat
    
    #
    experimentName = '{}_LOF'.format(testdata_name)
    # Choose ranking method
    # rank_group = rank_high_low
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]
    
    test_weather_ts = test_EVs_ts[0] # test weather data
    
    # MV_index = 0 # MV we are examining
    MV_predictions = []
    for MV_index in range(len(MVs)):
        predictions = []
        for n in range(test_weather_ts.shape[0]):
            # The 20 closest weather days
            weather_group = rank_group(weather_ts,test_weather_ts[n])['Day'][:20]
    
            print('{} - group length:{}'.format(n,len(weather_group)))
            if len(weather_group) < 10:
                predictions.append('len<')
                continue
            
    
            # reshape to row array to concatenate
            test_data_point = test_MVs_ts[MV_index,n].reshape((1,MVs_ts[MV_index,weather_group].shape[1]))
            # concatenated matrix of training data and the test data sample
            NT_data = np.concatenate((MVs_ts[MV_index,weather_group],test_data_point),axis = 0)
            
            LOF = LocalOutlierFactor(n_neighbors = 3,metric='precomputed')
            D = gen_dist_mat(NT_data) # distance matrix
            
            # if the distance matrix is all zeros (all time series are identical), skip this sample
            if len(D[D == 0]) == D.shape[0]*D.shape[1]:
                predictions.append('D=0')
                continue
                
            pred = LOF.fit_predict(D)
            predictions.append(str(pred[-1])) # change to string to avoid comparison error in numpy later
            
            # if detected as outlier, save plot of MVs
            if pred[-1] == -1:
                plt.figure()
                # # draw only the current MV-----
                for c in weather_group:
                    plt.plot(MVs_ts[MV_index,c],color='steelblue',alpha=0.5,linestyle='dotted')
                plt.plot(test_MVs_ts[MV_index,n],color='gold')
                #--------------------------------
                
                # # draw for all MVs-------------
                # for index in range(MVs_ts.shape[0]):
                #     for c in combination:
                #         plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted')
                #     plt.plot(test_MVs_ts[index,n],color='gold')
                # plt.show()
                # -------------------------------
                
                dir_loc = r'C:\Users\James\Desktop\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index])
                # create the directory if it does not exist
                if not os.path.exists(dir_loc):
                    os.makedirs(dir_loc)
                # save faulty plot
                plt.savefig(dir_loc + '\\n{}.png'.format(n))
                plt.close()
            
        MV_predictions.append(np.array(predictions))
    
    
    p_fault = np.empty(MV_predictions[0].shape, dtype=bool) # faulty
    p_normal = np.empty(MV_predictions[0].shape, dtype=bool) # normal
    p_lack = np.empty(MV_predictions[0].shape, dtype=bool) # lack of data
    p_fault[:] = False
    p_normal[:] = True # False
    p_lack[:] = True # False
    for predictions in MV_predictions:
        p_fault = np.logical_or(p_fault, predictions=='-1')
        normal_with_identical = np.logical_or(predictions=='1',predictions=='D=0')
        p_normal = np.logical_and(p_normal,normal_with_identical)
        p_lack = np.logical_and(p_lack, predictions=='len<')
        
    # the indices of ts sample which are considered faulty
    fault_index = np.arange(len(p_fault))[p_fault]
    normal_index = np.arange(len(p_normal))[p_normal]
    lack_index = np.arange(len(p_lack))[p_lack]
    
    # print results:
    fd_rate = 'Fault detection rate:\t {}%'.format(len(fault_index)/test_weather_ts.shape[0]*100)
    nd_rate = 'Normal operation rate:\t {}%'.format(len(normal_index)/test_weather_ts.shape[0]*100)
    ld_rate = 'Lack of data rate:\t {}%'.format(len(lack_index)/test_weather_ts.shape[0]*100)
    
    print(fd_rate)
    print(nd_rate)
    print(ld_rate)
    
    # Save results:
    dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName)
    with open(dir_loc+'\\results.txt','w') as f:
        f.write(fd_rate + '\n' + nd_rate+ '\n' + ld_rate)
    
    
    
    
    
    
    
    # Isolation Forest
    
    from sklearn.ensemble import IsolationForest
    from myFunctions import gen_dist_mat
    
    #
    experimentName = '{}_IsolationForest'.format(testdata_name)
    # Choose ranking method
    # rank_group = rank_high_low
    rank_group = rank_methods[rank_method_index]
    rank_method_name = rank_methods_names[rank_method_index]
    
    # test_weather_ts = test_EVs_ts[0] # test weather data
    
    # MV_index = 0 # MV we are examining
    MV_predictions = []
    for MV_index in range(len(MVs)):
        predictions = []
        for n in range(test_weather_ts.shape[0]):
            # The 20 closest weather days
            weather_group = rank_group(weather_ts,test_weather_ts[n])['Day'][:20]
            
            print('{} - group length:{}'.format(n,len(weather_group)))
            if len(weather_group) < 10:
                predictions.append('len<')
                continue
            
    
            # reshape to row array to concatenate
            test_data_point = test_MVs_ts[MV_index,n].reshape((1,MVs_ts[MV_index,weather_group].shape[1]))
            # concatenated matrix of training data and the test data sample
            NT_data = np.concatenate((MVs_ts[MV_index,weather_group],test_data_point),axis = 0)
            
            D = gen_dist_mat(NT_data) # distance matrix
            
            # if the distance matrix is all zeros (all time series are identical), skip this sample
            if len(D[D == 0]) == D.shape[0]*D.shape[1]:
                predictions.append('D=0')
                continue
            
            IsoForest = IsolationForest()
            IsoForest.fit(NT_data)
            pred = IsoForest.predict(NT_data)    
            
            predictions.append(str(pred[-1])) # change to string to avoid comparison error in numpy later
            
            # if detected as outlier, save plot of MVs
            if pred[-1] == -1:
                plt.figure()
                # # draw only the current MV-----
                for c in weather_group:
                    plt.plot(MVs_ts[MV_index,c],color='steelblue',alpha=0.5,linestyle='dotted')
                plt.plot(test_MVs_ts[MV_index,n],color='gold')
                #--------------------------------
                
                # # draw for all MVs-------------
                # for index in range(MVs_ts.shape[0]):
                #     for c in combination:
                #         plt.plot(MVs_ts[index,c],color=color_list[index],alpha=0.5,linestyle='dotted')
                #     plt.plot(test_MVs_ts[index,n],color='gold')
                # plt.show()
                # -------------------------------
                
                dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}\{}'.format(rank_method_name,experimentName,MVs[MV_index])
                # create the directory if it does not exist
                if not os.path.exists(dir_loc):
                    os.makedirs(dir_loc)
                # save faulty plot
                plt.savefig(dir_loc + '\\n{}.png'.format(n))
                plt.close()
            
        MV_predictions.append(np.array(predictions))
    
    
    p_fault = np.empty(MV_predictions[0].shape, dtype=bool) # faulty
    p_normal = np.empty(MV_predictions[0].shape, dtype=bool) # normal
    p_lack = np.empty(MV_predictions[0].shape, dtype=bool) # lack of data
    p_fault[:] = False
    p_normal[:] = True # False
    p_lack[:] = True # False
    for predictions in MV_predictions:
        p_fault = np.logical_or(p_fault, predictions=='-1')
        normal_with_identical = np.logical_or(predictions=='1',predictions=='D=0')
        p_normal = np.logical_and(p_normal,normal_with_identical)
        p_lack = np.logical_and(p_lack, predictions=='len<')
        
    # the indices of ts sample which are considered faulty
    fault_index = np.arange(len(p_fault))[p_fault]
    normal_index = np.arange(len(p_normal))[p_normal]
    lack_index = np.arange(len(p_lack))[p_lack]
    
    # print results:
    fd_rate = 'Fault detection rate:\t {}%'.format(len(fault_index)/test_weather_ts.shape[0]*100)
    nd_rate = 'Normal operation rate:\t {}%'.format(len(normal_index)/test_weather_ts.shape[0]*100)
    ld_rate = 'Lack of data rate:\t {}%'.format(len(lack_index)/test_weather_ts.shape[0]*100)
    
    print(fd_rate)
    print(nd_rate)
    print(ld_rate)
    
    # Save results:
    dir_loc = r'N:\HVAC_ModelicaModel_Data\python_figs\rank\{}\{}'.format(rank_method_name,experimentName)
    with open(dir_loc+'\\results.txt','w') as f:
        f.write(fd_rate + '\n' + nd_rate+ '\n' + ld_rate)
Example #25
    # read arguments
    parser = argparse.ArgumentParser()
    parser.add_argument('--data-path', type=str, help='path to the dataset')
    args = parser.parse_args()

    # process data
    df = pd.read_csv(args.data_path, index_col=None, header=None)  # read it
    df = process_data(df)

    # split data
    train, test_norm, test_anom = split_data(df)
    X_train, y_train = train  # unpack training data

    # train model
    model = IsolationForest(random_state=RAND_STATE, n_estimators=50)
    model.fit(X_train.astype('float32'))

    # convert to onnx
    from skl2onnx import convert_sklearn
    initial_types = [('float_input', FloatTensorType([None,
                                                      X_train.shape[1]]))]
    onx = convert_sklearn(model, initial_types=initial_types)
    session = ort.InferenceSession(onx.SerializeToString())
    input_name = session.get_inputs()[0].name
    label_name = session.get_outputs()[0].name
    del onx, model
    model = session, input_name, label_name
    if (DEBUG): print(f'ONNX Runtime Device: {ort.get_device()}')

    # score model
    mlflow.log_metric('F1-Score Training Normal', compute_f1(model, train, 1))
    mlflow.log_metric('F1-Score Testing Normal',
def test_iforest_deprecation():
    iforest = IsolationForest(behaviour='new')
    warn_msg = "'behaviour' is deprecated in 0.22 and will be removed in 0.24"
    with pytest.warns(DeprecationWarning, match=warn_msg):
        iforest.fit(iris.data)
from sklearn.ensemble import IsolationForest

rng = np.random.RandomState(42)

# Generate train data
X = 0.3 * rng.randn(100, 2)
X_train = np.r_[X + 2, X - 2]
# Generate some regular novel observations
X = 0.3 * rng.randn(20, 2)
X_test = np.r_[X + 2, X - 2]
# Generate some abnormal novel observations
X_outliers = rng.uniform(low=-4, high=4, size=(20, 2))

# fit the model
clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(X_train)
y_pred_train = clf.predict(X_train)
y_pred_test = clf.predict(X_test)
y_pred_outliers = clf.predict(X_outliers)

# plot the line, the samples, and the nearest vectors to the plane
xx, yy = np.meshgrid(np.linspace(-5, 5, 50), np.linspace(-5, 5, 50))
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.title("IsolationForest")
plt.contourf(xx, yy, Z, cmap=plt.cm.Blues_r)

b1 = plt.scatter(X_train[:, 0], X_train[:, 1], c='white')
b2 = plt.scatter(X_test[:, 0], X_test[:, 1], c='green')
c = plt.scatter(X_outliers[:, 0], X_outliers[:, 1], c='red')
Example #28
LABEL = "ddG_offset"


y = dataset["ddG_offset"]

for i in range(20):

	# Split into Train and Test (80/20)
	train, test = train_test_split(dataset, test_size=0.2)

	# Outlier exclusion step:

	with warnings.catch_warnings():
		warnings.simplefilter("ignore")
		outlier_forest = IsolationForest(max_samples = "auto")
		outlier_forest.fit(train)

		y_no_outliers = outlier_forest.predict(train)
		y_no_outliers = pd.DataFrame(y_no_outliers, columns = ['Top'])
		y_no_outliers[y_no_outliers['Top'] == 1].index.values

		train = train.iloc[y_no_outliers[y_no_outliers['Top'] == 1].index.values]
		train.reset_index(drop = True, inplace = True)

		print("Number of outliers in training data:", y_no_outliers[y_no_outliers['Top'] == -1].shape[0])

	# Normalisation step:

	scaler_ddG_offset = StandardScaler()
	mat_ddG_offset = np.array(train.ddG_offset).reshape((len(train)), 1)
	scaler_ddG_offset.fit(mat_ddG_offset)
Example #29
# ## Improving the Prediction model ##
# This part is about finding a better model for predicting future house sale prices.
# 
# First, I will detect outliers and delete them from the dataset if needed.

# ### Detecting Outliers ###
# The first step to improve our learning behaviour is to find outliers and then remove them from the data set if needed.
# To detect outliers I will use the Isolation Forest algorithm, which works well for high-dimensional data sets such as the one present here.

# In[ ]:

from sklearn.ensemble import IsolationForest

clf = IsolationForest(max_samples=100, random_state=rng)
clf.fit(df)
y = clf.predict(df)
print(y)
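
# The markdown above mentions deleting the detected outliers if needed; a minimal
# follow-up sketch (an assumption, not part of the original notebook) keeps only the
# rows that IsolationForest labelled as inliers (+1):

# In[ ]:

df_inliers = df[y == 1]
print(len(df), len(df_inliers))  # rows before / after dropping predicted outliers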


# ### Location based prices ###
# House prices don't only depend on the size of the house or the number of rooms, but are also highly dependent on the location of the house. To get an idea of how the location might impact my data, I analyse the relationship between location and price in my dataset.

# In[ ]:

import gmaps
gmaps.configure(api_key="AIzaSyDPWAl8lcrK9q-tOkrl64sGkxDnbWz47Ko")

locations = df[["lat", "long"]]
prices = df["price"]
Example #30
pp = PdfPages(plotfolder + 'scatterplots.pdf')
for j, features in enumerate(feature_pairs):
    X, Y = features[0], features[1]
    print(j, 'of', len(feature_pairs))
    pair_features = np.array([INFO[features[0]], INFO[features[1]]]).T
    forest = IsolationForest(
        n_estimators=100,  #max_samples=1000,
        random_state=0,
        contamination=num_outlier / 343546.0  # number of nodes
    )
    fig = scatter_plot(INFO[X], INFO[Y], INFO['IDs'], discription[Y],
                       discription[X],
                       discription[Y] + ' vs ' + discription[X],
                       compare_value[X])
    forest.fit(pair_features)
    scores = forest.decision_function(pair_features[outlier_ids, :])
    rank_list = sorted([(outliers[i], -s) for (i, s) in enumerate(scores)],
                       key=lambda x: x[1],
                       reverse=True)
    rank_matrix.append(rank_list)
pp.close()

print(rank_matrix)

#  runs properly up to this point; why is generate_graph returning nothing?
scaled_matrix, normal_matrix = ranklist.generate_graph(P_val, num_outlier,
                                                       rank_matrix)
plots = plotSpot(budget, scaled_matrix, "SpellOut")
frequencies = generate_frequency_list(plots, scaled_matrix)
for i, plot in enumerate(plots):
Example #31
    y = mat_data['y']
    file_name = 'experiment_results/' + datasets[i] + '.txt'
    File_object = open(file_name, "w")
    time_all = np.zeros((trials, 5))
    precision_all = np.zeros((trials, 5))
    auc_all = np.zeros((trials, 5))

    for j in range(0, trials):

        print('\n\n******' + datasets[i] + ' trial ' + str(j + 1) +
              '*******\n\n')

        print('\n******Iso-Forest*******\n')
        start = time.time()
        clf = IsolationForest(contamination=0.1, behaviour='new')
        clf.fit(X)
        end = time.time()
        time_all[j, 0] = end - start
        iso_scores = clf.score_samples(X)

        if run_lof_svm == 0:
            lof_scores = iso_scores
            osvm_scores = iso_scores
        elif j == 0:

            print('\n******LOF*******\n')
            start = time.time()
            lof = LocalOutlierFactor()
            lof.fit(X)
            end = time.time()
            time_all[j, 1] = end - start
Example #32
    def detect(self):
        '''
        Outlier detection using an isolation forest.
        '''
        # Get the preprocessed data
        data = self.preprocess()

        # Anomaly detection
        # Create the IsolationForest
        ilf = IsolationForest(
            n_estimators=self.n_estimators,
            n_jobs=-1,  # use all CPUs
            verbose=self.verbose,
            contamination=self.contamination,  # expected proportion of outliers
        )

        # Control flow for saving/loading the model
        if self.isSaveModel and self.isLoadModel:
            # isSaveModel = True & isLoadModel = True
            # Train, save the model locally, then continue with prediction
            # Train
            print('Model training...')
            ilf.fit(data)
            # Save the model locally
            print('Saving model to `%s`...' % self.modelname)
            with open(self.modelname, 'wb') as fp:
                pickle.dump(ilf, fp)
        elif self.isSaveModel:
            # isSaveModel = True & isLoadModel = False
            # Train and save the model locally, then stop without predicting
            # Train
            print('Model training...')
            ilf.fit(data)
            # Save the model locally
            print('Saving model to `%s`...' % self.modelname)
            with open(self.modelname, 'wb') as fp:
                pickle.dump(ilf, fp)
            print('Don\'t predict.')
            return
        elif self.isLoadModel:
            # isSaveModel = False & isLoadModel = True
            # Load the local model directly, then continue with prediction
            # Load the local model
            print('Loading model from `%s`...' % self.modelname)
            with open(self.modelname, 'rb') as fp:
                ilf = pickle.load(fp)
        else:
            # isSaveModel = False & isLoadModel = False
            # Train without saving the model, then continue with prediction
            # Train
            print('Model training...')
            ilf.fit(data)

        # Predict
        print('Outliers predicting...')
        shape = data.shape[0]
        all_pred = []
        all_score = []
        for i in range(int(shape / self.batch) + 1):
            start = i * self.batch
            end = (i + 1) * self.batch
            batch_test = data[start:end]
            # Predict
            # Return value: +1 means a normal sample, -1 means an anomalous sample
            pred = ilf.predict(batch_test)
            # Anomaly score for each sample; the lower the score, the more likely it is an anomaly
            score = ilf.decision_function(batch_test)
            all_pred.extend(pred)
            all_score.extend(score)

        data['timestamp'] = self.origin_data['timestamp']
        data['is_outlier'] = all_pred
        data['outlier_score'] = all_score

        # Convert the output column values
        data['timestamp'] = data['timestamp'].astype('int64')
        data.loc[data.is_outlier == 1, 'is_outlier'] = 0
        data.loc[data.is_outlier == -1, 'is_outlier'] = 1

        print('Writing `%s`...' % self.output_filename)
        data.to_csv(self.output_filename,
                    columns=['timestamp', 'outlier_score', 'is_outlier'],
                    header=True,
                    index=0)
Example #33
                               random_state= 0,
                               shuffle= True)





'''__________________Anomaly detection________________________________'''

cont = 0.1
IS = IsolationForest(max_samples=300, 
                     contamination=cont, 
                     max_features=1.0, 
                     random_state=0)

IS.fit(_xtrain)

pred_train = IS.predict(_xtrain)
# for i in pred_train: print(i)
print(pred_train[-6]==-1)
num_of_anam = [1 for i in pred_train if i == -1]
print('number of anomalies given contamination of %s : %d ' % (cont, len(num_of_anam)))


print(x_value.columns)
print(x_value.shape)

# Print shapes
print(x_value.shape)
print(y_value.shape)

#Algorithms used: Isolation Forest and Local Outlier Factor are common anomaly detection methods
random_isolation = IsolationForest(max_samples=len(x_value),
                                   contamination=outlier_value,
                                   random_state=3)
local_outlier = LocalOutlierFactor(n_neighbors=12, contamination=outlier_value)

n_outlier = len(fraudal_count)
#fit and predict
random_isolation.fit(x_value)
score_prediction = random_isolation.decision_function(x_value)
y_predict_isf = random_isolation.predict(x_value)

y_predict_lof = local_outlier.fit_predict(x_value)
score_prediction = local_outlier.negative_outlier_factor_

#Change the value to 0 for valid and 1 for fradual cases.
y_predict_isf[y_predict_isf == 1] = 0
y_predict_isf[y_predict_isf == -1] = 1
y_predict_lof[y_predict_lof == 1] = 0
y_predict_lof[y_predict_lof == -1] = 1

n_error_isf = (y_predict_isf != y_value).sum()
n_error_lof = (y_predict_lof != y_value).sum()
print("Error value for Isolation forest ", n_error_isf)
Example #35
0
r2_score(train.y, ens3_insample_pred)  # 0.70266651298615024

# Predict
ens3_pred = ens3.predict(df_test)  # LB:

submission         = pd.read_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/input/sample_submission.csv')
submission.y       = ens3_pred
submission.id      = id
submission.columns = ['ID', 'y']
submission.to_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/output/layer2_gbreg.csv', index=False)

print("Ensemble Model 4: IsolationForest")
ens4  = IsolationForest(n_estimators=100, max_samples='auto', contamination=0.1, max_features=1.0,
                        bootstrap=False, n_jobs=1, random_state=None, verbose=0)

ens4.fit(df, train.y)

# In Sample R2
ens4_insample_pred = ens4.predict(df)
print(r2_score(train.y, ens4_insample_pred )) #

# Predict
ens4_pred = ens4.predict(df_test) # LB:

submission         = pd.read_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/input/sample_submission.csv')
submission.y       = ens4_pred
submission.id      = id
submission.columns = ['ID', 'y']
submission.to_csv('T:/RNA/Baltimore/Jason/ad_hoc/mb/output/layer2_isolationforest.csv', index=False)

print("Ensemble Model 5: RandomTreesEmbedding")
Example #36
0
vectorizer = TfidfVectorizer(min_df = 0.0, analyzer="char", sublinear_tf=True, ngram_range=(1,3)) #converting data to vectors
X = vectorizer.fit_transform(queries)
display_scores(vectorizer, X)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42) #splitting data

badCount = len(badQueries)
validCount = len(validQueries)

# lgs = LogisticRegression(class_weight={1: 2 * validCount / badCount, 0: 1.0}) # class_weight='balanced')
# lgs = LogisticRegression(penalty='l1')
rng = np.random.RandomState(42)
clf = IsolationForest(max_samples=100, random_state=rng, n_jobs=4,
                      contamination=badCount / (validCount + badCount))
print('fitting')
# lgs.fit(X_train, y_train) #training our model
clf.fit(X_train) #training our model
print ('done')

y_pred_train = clf.predict(X_train)
y1 = np.array(y_pred_train)
y2 = np.array(y_train)
print(len(y_pred_train))
print(len(y_train))
print(np.sum(y1 == y2))

##############
# Evaluation #
##############

# predicted = lgs.predict(X_test)
print(clf.predict(vectorizer.transform(['/<script>alert(123)</script>'])))

esc = x
escala = MinMaxScaler()
escala.fit(esc)
escalada = escala.transform(esc)
pca=PCA(n_components=2)
pca.fit(escalada)
transformada=pca.transform(escalada)

# plot the data
mglearn.discrete_scatter(transformada[:,0], transformada[:,1])

modelo = IsolationForest(n_estimators=100, max_samples=256, contamination=0.02)

modelo.fit(transformada)

predict = modelo.predict(transformada)

"""
Con un nuevo set de datos se volverá a entrenar el modelo, este dataset contiene comportamiento de malware.
"""
dataframe2 = pd.read_csv('trafico_prueba_2016.csv', index_col = 'Time')

dataframe2['Count'] = np.nan

df2 = dataframe2.groupby(['Time', 'Src Port', 'Dst Port', 'Source', 'Protocol', 'Length']).size().reset_index(name='counts')
dfpredict2 = df2.copy()

dfpredict2['Time'] = encoder.fit_transform(dfpredict2['Time'])
dfpredict2['Src Port'] = encoder.fit_transform(dfpredict2['Src Port'])
    training_data = np.asarray(df)
    for t in training_data:
        print(t)

    # TODO - Data pre-processing step
    # Either eliminate the date column altogether or convert it to unix epoch time.

    #standardize - this normalizes values between 0 and 1, so the data can be used for plotting simultaneous lines
    scaler = MinMaxScaler().fit(training_data)
    training_data_transformed = scaler.transform(training_data)
    for t in training_data_transformed:
        print(t)

    #create model
    model = IsolationForest()
    model.fit(training_data_transformed)
    prediction = model.predict(training_data_transformed)

    #see classification results
    for p in prediction:
        if p == -1:
            anomaly_count += 1
            print("anomaly ", p)
        else:
            normal_count += 1
            print("normal ", p)
    print("anomaly count: ", anomaly_count)
    print("normal count: ", normal_count)

    #save model & scaler for later application
    pickle.dump(model, open(model_file, 'wb'))
Example #39
0
df_All = shuffle(df_All)

df_X = df_All.drop(["certid", "label"], axis=1, inplace=False)

df_y = df_All["label"]

X_train, X_test, y_train, y_test = train_test_split(df_X, df_y, test_size=0.2)
X_cols = X_train.columns
y_train = y_train.values
y_test = y_test.values


# IF_clf = LocalOutlierFactor(contamination=0.1)
# y_pred_train=IF_clf.fit_predict(X_train)
IF_clf = IsolationForest(n_estimators=1000, contamination=0.1, n_jobs=-1, bootstrap=True)
IF_clf.fit(X_train)
y_pred_train = IF_clf.predict(X_train)

A = pd.DataFrame(X_train, columns=X_cols)
B = pd.DataFrame(y_train, columns=["label_ori"])
C =  pd.DataFrame(y_pred_train, columns=["label_IF"])
print A.shape
print B.shape
print C.shape
new_tran_df = pd.concat([A, B, C], axis=1)

#print new_tran_df

# new_tran_df = new_tran_df[new_tran_df["label_IF"]>0]

new_tran_df_0 = new_tran_df[new_tran_df["label_IF"] == 1]   # points the Isolation Forest labels as normal
Example #40
0
def anomaly_detection(X, name='anomaly'):
    pr = IsolationForest()
    pr.fit(filter_numerical(X))
    x = pr.predict(filter_numerical(X))
    X[name] = x
    X[name] = X[name].astype(str)
Example #41
0
    li_ = np.array(li)
    x, y = li_.shape
    # if x<100:
    pca = PCA(n_components=lenth)
    li_low = pca.fit_transform(li_)

    rng = np.random.RandomState(42)
    # construct the training samples
    n_samples = len(li_low)  # total number of samples

    outliers_fraction = 0.4  # proportion of anomalous samples
    X_train = li_low

    # fit the model
    clf = IsolationForest(max_samples=n_samples,
                          random_state=rng,
                          contamination=outliers_fraction)
    clf.fit(X_train)
    y_pred_train = clf.predict(X_train)
    scores_pred = clf.decision_function(X_train)
    print(video_name)
    print(y_pred_train)

    # LOF
    # clf = LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction)
    # y_pred_train = clf.fit_predict(X_train)
    # print(video_name)
    # print(y_pred_train)
    i = i + 1
print(i)
        # merge
        vehicle = pd.merge(rpm, speed, how = 'outer', on = 'timestamp')

        # drop null values and zero speeds --> neutral gear
        # speed < 200 to remove outliers

        vh = vehicle.dropna(axis = 0)
        vh = vh[(vh['rpm'] > 0) & ((vh['speed'] > 0) & (vh['speed'] < 200))]

        # detect outliers using IsolationForest
        # assume contamination at 0.01 level

        distances = pairwise_distances(vh[['rpm','speed']],vh[['rpm','speed']], metric = 'cosine')
        clf = IsolationForest(max_samples = 100, contamination = 0.01, verbose = 1)
        clf.fit(distances)
        labels = clf.predict(distances)
        vh['outlier'] = labels

        # remove outliers found by IsolationForest
        vh = vh[['rpm','speed']][vh['outlier'] == 1]

        #recompute distances after outlier removal
        distances = pairwise_distances(vh[['rpm','speed']],vh[['rpm','speed']], metric = 'cosine')

        # initialize variable to keep best model, its silhouette score and predicted labels
        best_model = (None, -1, None)

        # iterate over possible number of gears
        # since we want to pick model with best silhouette score, can't start with single cluster (k=1)
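        # --- Sketch only: the original loop is truncated at this example boundary. ---
        # A minimal illustration of what the gear-count search described above might
        # look like, assuming KMeans clustering and silhouette_score on the
        # precomputed cosine distances (imports would normally live at module top):
        from sklearn.cluster import KMeans
        from sklearn.metrics import silhouette_score

        for k in range(2, 9):  # plausible numbers of gears
            km = KMeans(n_clusters=k, random_state=0).fit(vh[['rpm', 'speed']])
            score = silhouette_score(distances, km.labels_, metric='precomputed')
            if score > best_model[1]:
                best_model = (km, score, km.labels_)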
Example #43
0
class IForest(BaseDetector):
    """Wrapper of scikit-learn Isolation Forest with more functionalities.

    The IsolationForest 'isolates' observations by randomly selecting a
    feature and then randomly selecting a split value between the maximum and
    minimum values of the selected feature.
    See :cite:`liu2008isolation,liu2012isolation` for details.

    Since recursive partitioning can be represented by a tree structure, the
    number of splittings required to isolate a sample is equivalent to the path
    length from the root node to the terminating node.

    This path length, averaged over a forest of such random trees, is a
    measure of normality and our decision function.

    Random partitioning produces noticeably shorter paths for anomalies.
    Hence, when a forest of random trees collectively produce shorter path
    lengths for particular samples, they are highly likely to be anomalies.

    Parameters
    ----------
    n_estimators : int, optional (default=100)
        The number of base estimators in the ensemble.

    max_samples : int or float, optional (default="auto")
        The number of samples to draw from X to train each base estimator.

            - If int, then draw `max_samples` samples.
            - If float, then draw `max_samples * X.shape[0]` samples.
            - If "auto", then `max_samples=min(256, n_samples)`.

        If max_samples is larger than the number of samples provided,
        all samples will be used for all trees (no sampling).

    contamination : float in (0., 0.5), optional (default=0.1)
        The amount of contamination of the data set, i.e. the proportion
        of outliers in the data set. Used when fitting to define the threshold
        on the decision function.

    max_features : int or float, optional (default=1.0)
        The number of features to draw from X to train each base estimator.

            - If int, then draw `max_features` features.
            - If float, then draw `max_features * X.shape[1]` features.

    bootstrap : boolean, optional (default=False)
        If True, individual trees are fit on random subsets of the training
        data sampled with replacement. If False, sampling without replacement
        is performed.

    n_jobs : integer, optional (default=1)
        The number of jobs to run in parallel for both `fit` and `predict`.
        If -1, then the number of jobs is set to the number of cores.

    random_state : int, RandomState instance or None, optional (default=None)
        If int, random_state is the seed used by the random number generator;
        If RandomState instance, random_state is the random number generator;
        If None, the random number generator is the RandomState instance used
        by `np.random`.

    verbose : int, optional (default=0)
        Controls the verbosity of the tree building process.

    Attributes
    ----------
    estimators_ : list of DecisionTreeClassifier
        The collection of fitted sub-estimators.

    estimators_samples_ : list of arrays
        The subset of drawn samples (i.e., the in-bag samples) for each base
        estimator.

    max_samples_ : integer
        The actual number of samples

    decision_scores_ : numpy array of shape (n_samples,)
        The outlier scores of the training data.
        The higher, the more abnormal. Outliers tend to have higher
        scores. This value is available once the detector is
        fitted.

    threshold_ : float
        The threshold is based on ``contamination``. It is the
        ``n_samples * contamination`` most abnormal samples in
        ``decision_scores_``. The threshold is calculated for generating
        binary outlier labels.

    labels_ : int, either 0 or 1
        The binary labels of the training data. 0 stands for inliers
        and 1 for outliers/anomalies. It is generated by applying
        ``threshold_`` on ``decision_scores_``.
    """

    def __init__(self, n_estimators=100,
                 max_samples="auto",
                 contamination=0.1,
                 max_features=1.,
                 bootstrap=False,
                 n_jobs=1,
                 random_state=None,
                 verbose=0):
        super(IForest, self).__init__(contamination=contamination)
        self.n_estimators = n_estimators
        self.max_samples = max_samples
        self.max_features = max_features
        self.bootstrap = bootstrap
        self.n_jobs = n_jobs
        self.random_state = random_state
        self.verbose = verbose

    def fit(self, X, y=None):
        """Fit detector. y is optional for unsupervised methods.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The input samples.

        y : numpy array of shape (n_samples,), optional (default=None)
            The ground truth of the input samples (labels).
        """
        # validate inputs X and y (optional)
        X = check_array(X)
        self._set_n_classes(y)

        self.detector_ = IsolationForest(n_estimators=self.n_estimators,
                                         max_samples=self.max_samples,
                                         contamination=self.contamination,
                                         max_features=self.max_features,
                                         bootstrap=self.bootstrap,
                                         n_jobs=self.n_jobs,
                                         random_state=self.random_state,
                                         verbose=self.verbose)
        self.detector_.fit(X=X,
                           y=None,
                           sample_weight=None)

        # invert decision_scores_. Outliers come with higher outlier scores.
        self.decision_scores_ = invert_order(
            self.detector_.decision_function(X))
        self._process_decision_scores()
        return self

    def decision_function(self, X):
        """Predict raw anomaly score of X using the fitted detector.

        The anomaly score of an input sample is computed based on different
        detector algorithms. For consistency, outliers are assigned with
        larger anomaly scores.

        Parameters
        ----------
        X : numpy array of shape (n_samples, n_features)
            The training input samples. Sparse matrices are accepted only
            if they are supported by the base estimator.

        Returns
        -------
        anomaly_scores : numpy array of shape (n_samples,)
            The anomaly score of the input samples.
        """
        check_is_fitted(self, ['decision_scores_', 'threshold_', 'labels_'])
        # invert outlier scores. Outliers come with higher outlier scores
        return invert_order(self.detector_.decision_function(X))

    @property
    def estimators_(self):
        """The collection of fitted sub-estimators.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_

    @property
    def estimators_samples_(self):
        """The subset of drawn samples (i.e., the in-bag samples) for
        each base estimator.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.estimators_samples_

    @property
    def max_samples_(self):
        """The actual number of samples.
        Decorator for scikit-learn Isolation Forest attributes.
        """
        return self.detector_.max_samples_
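
# A minimal usage sketch for the IForest wrapper above (not part of the original
# listing). It assumes the class is importable, e.g. the pyod-style
# `from pyod.models.iforest import IForest`; the data and parameter values are
# illustrative only.
import numpy as np

rng = np.random.RandomState(42)
X_train = np.r_[rng.randn(195, 2), rng.randn(5, 2) + 6]  # mostly inliers plus a few obvious outliers

detector = IForest(n_estimators=100, contamination=0.05, random_state=42)
detector.fit(X_train)

print(detector.labels_[:10])            # 0 = inlier, 1 = outlier (thresholded via `threshold_`)
print(detector.decision_scores_[:5])    # training scores; higher means more abnormal
print(detector.decision_function(rng.randn(3, 2)))  # scores for unseen samples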
Example #44
0
        data = data.drop(cols, axis=1)
        data = data.join(vecData)
    return data, vecData, vec

df, t, v = ohEncoding(df, col, replace=True)

print "Shape after encoding"
print df.shape

df_unlabeled = df.drop("Anomaly", axis=1)
print "Shape of the dataframe without anomaly column: "
print df_unlabeled.shape

clf = IsolationForest(max_samples=6444, verbose=1, n_jobs=-1, contamination=0.255555
                      , bootstrap=True, max_features=9)
clf.fit(df_unlabeled)
pred = clf.predict(df_unlabeled)
# print type(pred)
# print data.shape
# print len(pred)
# print pred
anomalies = np.argwhere(pred == -1)
normal = np.argwhere(pred == 1)
# print anomalies
# print type(anomalies)

df['ISO1'] = pred

# iterate over rows
nLabAno = 0
nDetAno = 0
Example #45
0
train_numerical.fillna(0, inplace=True)
train_categoric = train.select_dtypes(include=["object"])
train_categoric.fillna("NONE", inplace=True)
train = train_numerical.merge(train_categoric,
                              left_index=True,
                              right_index=True)
test = pd.read_csv("./test.csv")
ID = test.Id
test.drop("Id", axis=1, inplace=True)
test_numerical = test.select_dtypes(exclude=["object"])
test_numerical.fillna(0, inplace=True)
test_categoric = test.select_dtypes(include=["object"])
test_categoric.fillna("NONE", inplace=True)
test = test_numerical.merge(test_categoric, left_index=True, right_index=True)
clf = IsolationForest(max_samples=100, random_state=42)
clf.fit(train_numerical)
y_noano = clf.predict(train_numerical)
y_noano = pd.DataFrame(y_noano, columns=["Top"])
train_numerical = train_numerical.iloc[y_noano[y_noano["Top"] ==
                                               1].index.values]
train_numerical.reset_index(drop=True, inplace=True)
train_categoric = train_categoric.iloc[y_noano[y_noano["Top"] ==
                                               1].index.values]
train_categoric.reset_index(drop=True, inplace=True)
train = train.iloc[y_noano[y_noano["Top"] == 1].index.values]
train.reset_index(drop=True, inplace=True)

col_train_num = list(train_numerical.columns)
col_train_num_bis = list(train_numerical.columns)
col_train_cat = list(train_categoric.columns)
col_train_num_bis.remove("SalePrice")
Example #46
0
# In[28]:


def falsenegative_accuracy(values):
    tn = list(values).count(-1)
    total = values.shape[0]
    accuracy = np.round(tn / total, 4)
    return accuracy
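
# Note (assumption): the cells below also call `falsepositive_accuracy`, which is
# not shown in this snippet; a minimal sketch, assuming it mirrors the function
# above but counts inlier (+1) predictions instead:
def falsepositive_accuracy(values):
    tp = list(values).count(1)
    total = values.shape[0]
    accuracy = np.round(tp / total, 4)
    return accuracy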


# In[29]:

st.subheader("Accuracy score For Isolation forest")
ISF = IsolationForest(random_state=42)
ISF.fit(ins)
falsepositive_isf = ISF.predict(ins)
falsenegative_isf = ISF.predict(outs)
in_accuracy_isf = falsepositive_accuracy(falsepositive_isf)
out_accuracy_isf = falsenegative_accuracy(falsenegative_isf)
st.write("Accuracy in Detecting falsepositive Alarm:", in_accuracy_isf)
st.write("Accuracy in Detecting falsenegative Alarm:", out_accuracy_isf)

# In[30]:

st.subheader("Accuracy score For Local Outlier Factor")
LOF = LocalOutlierFactor(novelty=True)
LOF.fit(ins)
falsepositive_lof = LOF.predict(ins)
falsenegative_lof = LOF.predict(outs)
in_accuracy_lof = falsepositive_accuracy(falsepositive_lof)
Example #47
0
    iforest = IsolationForest()
    lof = LocalOutlierFactor(n_neighbors=20)
    ocsvm = OneClassSVM()

    lim_inf = X.min(axis=0)
    lim_sup = X.max(axis=0)
    volume_support = (lim_sup - lim_inf).prod()
    t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
    axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
    unif = np.random.uniform(lim_inf, lim_sup,
                             size=(n_generated, n_features))

    # fit:
    print('IsolationForest processing...')
    iforest = IsolationForest()
    iforest.fit(X_train)
    s_X_iforest = iforest.decision_function(X_test)
    print('LocalOutlierFactor processing...')
    lof = LocalOutlierFactor(n_neighbors=20)
    lof.fit(X_train)
    s_X_lof = lof.decision_function(X_test)
    print('OneClassSVM processing...')
    ocsvm = OneClassSVM()
    ocsvm.fit(X_train[:min(ocsvm_max_train, n_samples_train - 1)])
    s_X_ocsvm = ocsvm.decision_function(X_test).reshape(1, -1)[0]
    s_unif_iforest = iforest.decision_function(unif)
    s_unif_lof = lof.decision_function(unif)
    s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0]
    plt.subplot(121)
    auc_iforest, em_iforest, amax_iforest = em(t, t_max,
                                               volume_support,
Example #48
0
def get_features(x):
	return resnet_model.predict(x)

from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import IsolationForest
from sklearn import svm

# Apply standard scaler to output from resnet50
ss = StandardScaler()
ss.fit(X_train)
X_train = ss.transform(X_train)
X_test = ss.transform(X_test)

# Take PCA to reduce feature space dimensionality
pca = PCA(n_components=512, whiten=True)
pca = pca.fit(X_train)
print('Explained variance percentage = %0.2f' % sum(pca.explained_variance_ratio_))
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

# Train classifier and obtain predictions for OC-SVM
oc_svm_clf = svm.OneClassSVM(gamma=0.001, kernel='rbf', nu=0.08)  # Obtained using grid search
if_clf = IsolationForest(contamination=0.08, max_features=1.0, max_samples=1.0, n_estimators=40)  # Obtained using grid search

oc_svm_clf.fit(X_train)
if_clf.fit(X_train)

oc_svm_preds = oc_svm_clf.predict(X_test)
if_preds = if_clf.predict(X_test)
Example #49
0
File: qc.py Project: ekunnii/APPIAN
def _IsolationForest(X):
    rng = np.random.RandomState(42)
    clf = IsolationForest(max_samples=X.shape[0], random_state=rng)
    return clf.fit(X).predict(X)
                print "ERROR"
                sys.exit(-1)

    Xtrain.append(column_train)
    Xtest.append(column_test)

Xtrain = np.transpose(np.array(Xtrain))
Xtest = np.transpose(np.array(Xtest))
idx_train = idx_train[:Xtrain.shape[0]]
idx_test = idx_test[:Xtest.shape[0]]

# fit an iforest
iforest =  IsolationForest(n_estimators=ntrees,
                           max_samples=sample_frac, max_features=feat_frac,
                           n_jobs=-1, random_state=rng, verbose=1)
iforest.fit(Xtrain)

# anomaly scores
y_pred_train = iforest.predict(Xtrain)
y_pred_test = iforest.predict(Xtest)
train_feature_values = [(gid, val)
                        for gid, val in zip(idx_train, list(y_pred_train))]
test_feature_values = [(gid, val)
                        for gid, val in zip(idx_test, list(y_pred_test))]
for i, scenario in enumerate(MALICIOUS_SCENARIOS):
    all_feature_values = train_feature_values + \
                         [(gid, feat_value)
                          for gid, feat_value in test_feature_values
                          if gid/100 in BENIGN_SCENARIOS or
                             gid/100 == scenario]
    all_values = np.array([feat_value
Example #51
0
    #ocsvm = OneClassSVM()
    #ocsvm = OneClassSVM(kernel='linear', degree=2, gamma='auto', nu=0.5)
    ocsvm = OneClassSVM(gamma='auto', nu=0.01)

    lim_inf = X.min(axis=0)
    lim_sup = X.max(axis=0)
    volume_support = (lim_sup - lim_inf).prod()
    t = np.arange(0, 100 / volume_support, 0.01 / volume_support)
    axis_alpha = np.arange(alpha_min, alpha_max, 0.0001)
    unif = np.random.uniform(lim_inf, lim_sup,
                             size=(n_generated, n_features))

    # fit:
    print('IsolationForest processing...')
    iforest = IsolationForest()
    iforest.fit(X_train)
    s_X_iforest = iforest.decision_function(X_train)
    print('LocalOutlierFactor processing...')
    lof.fit(X_train)
    s_X_lof = lof.decision_function(X_train)

    print('OneClassSVM processing...')
    ocsvm.fit(X_train)
    s_X_ocsvm = ocsvm.decision_function(X_train).reshape(1, -1)[0]
    
    s_unif_iforest = iforest.decision_function(unif)
    s_unif_lof = lof.decision_function(unif)
    s_unif_ocsvm = ocsvm.decision_function(unif).reshape(1, -1)[0]
    plt.subplot(121)
    
    print("t ist: " ,t)
    featureMatrix['is_train'] = np.random.uniform(0, 1, len(featureMatrix)) <= .75

    #split out the train and test df's into separate objects
    train, test = featureMatrix[featureMatrix['is_train']==True], featureMatrix[featureMatrix['is_train']==False]

    #drop the is_train column, we don't need it anymore
    train = train.drop('is_train', axis=1)
    test = test.drop('is_train', axis=1)

    #create the isolation forest class and factorize the class column
    clf = IsolationForest(n_estimators=opts.numtrees)


    #train the isolation forest on the training set, dropping the class column (since the trainer takes that as a separate argument)
    print('\nTraining')
    clf.fit(train.drop('class', axis=1))

    #remove the 'answers' from the test set
    testnoclass = test.drop('class', axis=1)

    print('\nPredicting (class 1 is normal, class -1 is malicious)')

    #evaluate our results on the test set.
    test.is_copy = False
    test['prediction'] = clf.predict(testnoclass)
    print()

    #group by class (the real answers) and prediction (what the forest said). we want these values to match for 'good' answers
    results=test.groupby(['class', 'prediction'])
    resultsagg = results.size()
    print(resultsagg)