def execute_ee(df, **kwargs):
    """
    Run Elliptic Envelope and return labels indicating whether each sample is an outlier.

    Parameters
    ----------
    df : pandas.DataFrame of shape (n_objects, n_features)
        The m_ap30 table.

    Other Parameters
    ----------------
    For details, see
    https://scikit-learn.org/stable/modules/generated/sklearn.covariance.EllipticEnvelope.html

    support_fraction : float, default 0.9
    contamination : float, default 0.01
    random_state : int or RandomState instance, default 42

    Returns
    -------
    y_pred : numpy.ndarray
        Outlier labels (1 for inliers, -1 for outliers).
    """
    support_fraction = kwargs.get('support_fraction', 0.9)
    contamination = kwargs.get('contamination', 0.01)
    random_state = kwargs.get('random_state', 42)
    clf = EllipticEnvelope(support_fraction=support_fraction,
                           contamination=contamination,
                           random_state=random_state)
    y_pred = clf.fit_predict(df)
    return y_pred
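A minimal usage sketch for execute_ee (an illustration, not part of the original module); it assumes numpy, pandas, and the EllipticEnvelope import shown elsewhere in these snippets, with a toy DataFrame standing in for the m_ap30 table:

import numpy as np
import pandas as pd
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(42)
toy = pd.DataFrame(rng.normal(size=(100, 3)), columns=['f1', 'f2', 'f3'])
labels = execute_ee(toy, contamination=0.05)  # 1 = inlier, -1 = outlier
print((labels == -1).sum(), 'outliers flagged')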
def ee_outliers(col_name):
    # Note: relies on a module-level df_processed DataFrame.
    ee = EllipticEnvelope()
    ee_preds = ee.fit_predict(np.array(df_processed[col_name]).reshape(-1, 1))
    ee_preds_class = ["ok" if i == 1 else "ee_outlier" for i in ee_preds]
    df_processed["ee_outlier"] = ee_preds_class
def DetectOutliersUsingEnvelope(self):
    data = self.__df[[self.__x, self.__y]].values
    clf = EllipticEnvelope()
    x_min_value = min(self.__df[self.__x].values) - self.__factor
    x_max_value = max(self.__df[self.__x].values) + self.__factor
    y_min_value = min(self.__df[self.__y].values) - self.__factor
    y_max_value = max(self.__df[self.__y].values) + self.__factor
    xx, yy = np.meshgrid(np.linspace(x_min_value, x_max_value, 500),
                         np.linspace(y_min_value, y_max_value, 500))
    clf.fit(data)
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    # Reuse the fit above instead of refitting; outliers = -1, inliers = 1.
    pred = clf.predict(data)
    if self.__drop_outliers:
        # Drop outliers from the dataset.
        outlier_index = self.__df.index[pred == -1]
        self.__df = self.__df.drop(outlier_index, axis=0)
        return self.__df
    else:
        return xx, yy, Z
def outlier_detector(df, percentage=0.01):
    model_detection = EllipticEnvelope(
        # contamination=percentage,  # outlier percentage, if you want to set it manually
        random_state=0)
    predict_x = model_detection.fit_predict(df)  # predictions are 1 or -1; -1 marks outliers
    outlier = np.where(predict_x == -1, 1, 0)  # convert to 1/0 so it can act as a boolean

    # Add an outlier column to the DataFrame
    df.loc[df.index, "outlier"] = outlier
    df["outlier"] = df["outlier"].astype("bool")

    # Print a summary
    df_check = df.copy()
    df_check["count"] = 1
    df_check = df_check.groupby(["outlier"])["count"].count()
    df_check = df_check.to_frame()
    sum_total = df_check.iloc[:, 0].sum()
    # Add results as a percentage of the total
    df_check["%_of_Total"] = df_check.iloc[:, 0].apply(
        lambda x: x / sum_total)  # .astype(str) + '%'
    print(df_check)

    df_no_outlier = df[df["outlier"] == False]
    df_no_outlier = df_no_outlier.iloc[:, :-1]
    print(f"\nAmount of Outliers Removed: {len(df.index) - len(df_no_outlier.index)}")
    return df_no_outlier
def main():
    '''
    The procedure consists of two simple steps:

    - Scale the data to the standard distribution with mean 0 and unit variance.
      This might be too simplistic.
    - Apply the elliptic envelope. The contamination level is set manually.
    '''
    domains = []
    raw = []
    with open(sys.argv[1]) as fhandle:
        for line in fhandle:
            record = json.loads(line.strip())
            for analyser in record['analysers']:
                if analyser['analyser'] == 'FeaturesGenerator':
                    raw.extend(analyser['output'])
                if analyser['analyser'] == 'WordSegmentation':
                    domains.extend(analyser['output'].keys())
            if len(raw) != len(domains):
                print(record)
                sys.exit(0)
    x_samples = scale(np.array(raw))
    engine = EllipticEnvelope(contamination=0.015, support_fraction=1.0)
    y_samples = engine.fit_predict(x_samples)
    for index, y_sample in enumerate(y_samples):
        if y_sample == -1:
            print(domains[index])
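The two-step procedure described in the docstring (standard scaling, then the envelope) can be sketched in isolation; this is a minimal illustration on synthetic data, not the script's actual input format:

import numpy as np
from sklearn.preprocessing import scale
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(0)
features = rng.normal(loc=5.0, scale=2.0, size=(200, 4))
features[:3] += 25.0  # plant a few obvious outliers

x = scale(features)  # step 1: zero mean, unit variance per column
labels = EllipticEnvelope(contamination=0.015,
                          support_fraction=1.0).fit_predict(x)  # step 2
print(np.where(labels == -1)[0])  # indices flagged as outliers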
def outliers_EllipticEnvelope(df, contamination):
    dataset = df.copy()
    clf = EllipticEnvelope(contamination=contamination)
    df_with_ellip = dataset.join(pd.DataFrame(clf.fit_predict(dataset),
                                              index=dataset.index,
                                              columns=['elliptic']),
                                 how='left')
    return df_with_ellip.loc[df_with_ellip['elliptic'] != 1].index
def elliptic_envelope(df, modelDir, norm_confidence=0.95):
    from sklearn.covariance import EllipticEnvelope
    from scipy.stats import normaltest
    if "ds" in df.columns:
        del df["ds"]
    model = EllipticEnvelope()
    test_stats, p_vals = normaltest(df.values, axis=0)
    normal_cols = p_vals >= (1 - norm_confidence)
    df = df.loc[:, normal_cols]
    if df.shape[1] == 0:
        return None
    # Use column assignment: attribute-style assignment (df.outlier = ...) would
    # set an attribute on the DataFrame object, not create a column.
    df["outlier"] = model.fit_predict(df.values)
    df["outlier"] = df["outlier"] < 0  # fit_predict returns 1 for inliers, -1 for outliers
    return df
def elliptic_envelope(X_train, y_names):
    cov = EllipticEnvelope(contamination=0.05)
    y_pred = cov.fit_predict(X_train)
    plot(X_train, y_pred, "Covariance")
    outlier_list = []
    for i in range(len(y_pred)):
        if y_pred[i] == -1:
            outlier_list.append(i)
    print("Covariance")
    for i in range(len(outlier_list)):
        print(outlier_list[i], y_names[outlier_list[i]])
    print("Number of outliers:", len(outlier_list))
def detect_anomaly():
    # Use a distinct local name: assigning to "firebase" would shadow the module
    # and raise UnboundLocalError on the very call that creates the app.
    firebase_app = firebase.FirebaseApplication(
        'https://hackmit-df9ea.firebaseio.com', None)
    result = firebase_app.get('/Test/Double', None)
    result = np.array(list(result.values())[-60:]).reshape(-1, 1)

    # Normalize the result
    from sklearn.preprocessing import MinMaxScaler
    scaler = MinMaxScaler()
    result2 = scaler.fit_transform(result)

    from sklearn.covariance import EllipticEnvelope
    clf = EllipticEnvelope(contamination=.3)
    y_pred = clf.fit_predict(result2)
    an_outlier = len(list(filter(lambda x: x < 0, y_pred))) > 0
    # precision_ is the estimated pseudo-inverse covariance matrix, so compare
    # its largest entry rather than the matrix itself.
    if clf.precision_.max() > 100 and an_outlier:
        return True
    return False
def filter_outliers(points, n_estimators=100, contamination=0.05,
                    type='isolation_forest'):
    if type == 'elliptic':
        clf = EllipticEnvelope(contamination=contamination)
    elif type == 'isolation_forest':
        clf = IsolationForest(n_estimators=n_estimators,
                              contamination=contamination)
    else:
        raise ValueError("Unknown outlier filter type")
    y = clf.fit_predict(points.numpy())
    y = torch.tensor(y)
    num_valid = (y > 0).sum()
    num_filtered = (y <= 0).sum()
    logger.info('filtered points',
                num_filtered=num_filtered.item(),
                num_valid=num_valid.item())
    return y > 0
def get_outlyingness(data, contamination=0.1):
    """
    Outlier detection from covariance estimation in a Gaussian distributed dataset.

    :param data: Data in which to detect outliers. Take care that
                 n_samples > n_features ** 2.
    :type data: pandas.DataFrame
    :param contamination: The amount of contamination of the data set, i.e. the
                          proportion of outliers in the data set. Range is (0, 0.5).
    :type contamination: float
    :returns: Decision on each row if it's an outlier, and a contour array for
              drawing the ellipse in a graph.
    :rtype: tuple[numpy.ndarray, numpy.ndarray]
    """
    robust_cov = EllipticEnvelope(support_fraction=1., contamination=contamination)
    outlyingness = robust_cov.fit_predict(data)
    decision = (outlyingness - 1).astype(bool)  # True for outliers (-1), False for inliers (1)

    # Visualisation grid; the fixed 0-100 range assumes the data lie within it.
    xx, yy = np.meshgrid(np.linspace(0, 100, 101), np.linspace(0, 100, 101))
    z = robust_cov.predict(np.c_[xx.ravel(), yy.ravel()])
    z = z.reshape(xx.shape)
    return decision, z
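A quick usage sketch for get_outlyingness (an illustration with synthetic data, assuming the numpy/pandas/EllipticEnvelope imports used throughout these snippets); the two columns are kept inside the hard-coded 0-100 visualisation range:

import numpy as np
import pandas as pd
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(1)
frame = pd.DataFrame({'x': rng.normal(50, 5, 300), 'y': rng.normal(50, 5, 300)})
frame.iloc[:5] = 95.0  # a handful of points far from the cluster
decision, z = get_outlyingness(frame, contamination=0.02)
print(decision.sum(), 'outliers; contour grid shape', z.shape)  # grid is 101x101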
                         code='Cluster_LOF_Labels',
                         title='Cluster Distribution')
LOF_G2_Bar = bar_cluster(df=df_X_train_std,
                         x='Cluster_LOF_Labels',
                         code='Cluster_LOF_Labels',
                         title='Cluster Distribution')
LOF_G3_Bar = bar_cluster(df=df_X_train_std,
                         x='Cluster_LOF_Labels',
                         code='Cluster_LOF_Labels',
                         title='Cluster Distribution')

## EE
# fit_predict refits the estimator, so a preceding .fit() call is redundant.
EE = EllipticEnvelope(random_state=10, contamination=0.0058)
EE_labels = EE.fit_predict(X_train_std)

## 3D Plot of Training Data
# Create and modify dataframe for the cluster column
df_X_train_std = pd.DataFrame(X_train_std)
df_X_train_std['Cluster_EE'] = pd.Series(EE_labels, index=df_X_train_std.index)
# Rename cluster label names from EE
cluster_label_names = {1: "Human", -1: "Hacker"}
df_X_train_std['Cluster_EE_Labels'] = df_X_train_std['Cluster_EE'].map(
    cluster_label_names)
df_X_train_std.columns = [
    'Kill Death Ratio', "Headshot Kill Ratio", 'Win Ratio', "Top 10 Ratio",
    'Cluster_EE', 'Cluster_EE_Labels'
]
def establish_model(self):
    # Update button state
    self.new_model.setEnabled(False)
    self.new_model.setText('训练中')  # "Training"
    conn = pymysql.connect(host='localhost', port=3306, user='******',
                           password='', database='resistance')
    cs1 = conn.cursor()
    cs1.execute('use resistance')
    print(self.model_data_num.currentText().split(':'))
    cs1.execute("select * from {}_{} order by id desc limit {}".format(
        datetime.datetime.now().year,
        datetime.datetime.now().month,
        int(self.model_data_num.currentText().split(':')[1])))
    columns_name = cs1.fetchall()
    cs1.close()
    character_data = pd.DataFrame(np.array(columns_name)).iloc[:, -12:]
    print(character_data.shape)

    # Build the timestamped model directory name once instead of re-assembling
    # it for every file path.
    now_time = datetime.datetime.now()
    self.new_model_dir = '{}-{}-{} {}_{}_{}'.format(
        now_time.year, now_time.month, now_time.day,
        now_time.hour, now_time.minute, now_time.second)
    model_dir = os.path.join(os.getcwd(), 'model', self.new_model_dir)
    os.makedirs(model_dir)

    # Predict with LOF; contamination=0.028 is the assumed proportion of
    # outliers among all data points.
    lof = LocalOutlierFactor(novelty=True, n_neighbors=10, contamination=0.028)
    # error_lof_index = data_ana[clf.fit_predict(data_ana) == -1].index  # indices LOF flags as outliers
    lof_result = lof.fit(character_data)
    with open(os.path.join(model_dir, 'lof.model'), 'wb') as f:
        pickle.dump(lof, f)

    # Predict with Isolation Forest
    ilf = IsolationForest(max_samples=100, random_state=42,
                          n_estimators=200, contamination=0.028)
    ilf_result = ilf.fit_predict(character_data)
    with open(os.path.join(model_dir, 'ilf.model'), 'wb') as f:
        pickle.dump(ilf, f)

    # Predict with the elliptic envelope
    elf = EllipticEnvelope(support_fraction=1, contamination=0.028)
    elf_result = elf.fit_predict(character_data)
    with open(os.path.join(model_dir, 'elf.model'), 'wb') as f:
        pickle.dump(elf, f)

    self.new_model_msg.emit('ok')
    # Restore button state
    self.new_model.setEnabled(True)
    self.new_model.setText('开始训练')  # "Start training"
    self.st.stop_thread(self.sever_establish_model_th)
def plot_raw_overview(filename):
    event_type = 'all'
    if filename.name.startswith('sub-drouwen'):
        CHANS = [f'IH0{x + 1}' for x in range(8)]
    elif filename.name.startswith('sub-itens'):
        CHANS = [f'C0{x + 1}' for x in range(8)]
    elif filename.name.startswith('sub-lemmer'):
        CHANS = [f'IH{x + 1}' for x in range(8)]
    elif filename.name.startswith('sub-som705'):
        CHANS = [f'GA0{x + 1}' for x in range(8)]  # a bit random
    elif filename.name.startswith('sub-ommen'):
        CHANS = ['chan1', 'chan2']  # I don't understand why I cannot use 'chan64'
    elif filename.name.startswith('sub-vledder') or filename.name.startswith('sub-ommen'):
        # note: the 'sub-ommen' test here is shadowed by the branch above
        CHANS = ['chan1', 'chan64']
    elif '_acq-blackrock_' in filename.name:
        CHANS = ['chan1', 'chan128']
    else:
        print('you need to specify the reference channel for this test')
        return None, None

    d = Dataset(filename, bids=True)
    event_names, event_onsets = select_events(d, event_type)
    is_ecog = d.dataset.task.channels.tsv['type'] == 'ECOG'
    is_seeg = d.dataset.task.channels.tsv['type'] == 'SEEG'
    chans = array(d.header['chan_name'])[is_ecog | is_seeg]
    data = d.read_data(begtime=event_onsets[0], endtime=event_onsets[-1],
                       chan=list(chans))
    data.data[0][isnan(data.data[0])] = 0  # ignore NaN
    data = montage(data, ref_chan=CHANS)
    freq = frequency(data, taper='hann', duration=2, overlap=0.5)
    hist = make_histogram(data, max=250, step=10)
    divs = []
    fig = plot_hist(hist)
    divs.append(to_div(fig))

    bad_chans = None
    if AUTOMATIC:
        from sklearn.covariance import EllipticEnvelope
        algorithm = EllipticEnvelope(
            contamination=P['data_quality']['histogram']['contamination'])
        prediction = algorithm.fit(hist.data[0]).predict(hist.data[0])
        new_bad_chans = data.chan[0][prediction == -1]
        print('bad channels with histogram / elliptic envelope: ' +
              ', '.join(new_bad_chans))
        bad_chans = set(new_bad_chans)
        fig = plot_outliers(hist.chan[0], algorithm.dist_, prediction,
                            yaxis_title='distance', yaxis_type='log')
        divs.append(to_div(fig))

    fig = plot_freq(freq)
    divs.append(to_div(fig))

    if AUTOMATIC:
        from sklearn.neighbors import LocalOutlierFactor
        algorithm = LocalOutlierFactor(
            n_neighbors=P['data_quality']['spectrum']['n_neighbors'])
        prediction = algorithm.fit_predict(freq.data[0])
        new_bad_chans = data.chan[0][prediction == -1]
        print('bad channels with spectrum / local outlier factor: ' +
              ', '.join(new_bad_chans))
        bad_chans |= set(new_bad_chans)
        fig = plot_outliers(freq.chan[0], algorithm.negative_outlier_factor_,
                            prediction, yaxis_title='distance', yaxis_type='linear')
        divs.append(to_div(fig))

        # We use the reference channels again. They were handpicked, but they
        # might have a weird spectrum.
        bad_chans -= set(CHANS)

    return bad_chans, divs
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.decomposition import PCA

data = loadmat('ex8data2.mat')
X = data['X']

e1 = EllipticEnvelope()
labels1 = e1.fit_predict(X)
e2 = LocalOutlierFactor()
labels2 = e2.fit_predict(X)

n_components = 3
pca1 = PCA(n_components=n_components)
Xproj = pca1.fit_transform(X)

plt.figure()
plt.clf()
ax = plt.axes(projection='3d')
# ax.scatter(image_array[:, 0], image_array[:, 1], image_array[:, 2], c=labels, cmap='coolwarm', marker=',')
ax.scatter(Xproj[:, 0], Xproj[:, 1], Xproj[:, 2], marker='o', c=labels1)
SVM = svm.OneClassSVM(gamma='auto')
detected_results_SVM = SVM.fit_predict(featureData)
outliers_SVM = []
for i in range(len(detected_results_SVM)):
    if detected_results_SVM[i] < 0:
        outliers_SVM.append(i)
SVM_data = np.delete(featureData, outliers_SVM, axis=0)
output_SVM = np.delete(outputLabels, outliers_SVM, axis=0)

# Elliptic envelope
EE = EllipticEnvelope()
detected_results_EE = EE.fit_predict(featureData)
outliers_EE = []
for i in range(len(detected_results_EE)):
    if detected_results_EE[i] < 0:
        outliers_EE.append(i)
EE_data = np.delete(featureData, outliers_EE, axis=0)
output_EE = np.delete(outputLabels, outliers_EE, axis=0)

# Compare the outlier results
accuracy_scores = {}
model0 = svm.SVC()
model0.fit(IS_data[0:int(len(IS_data) / 2)], output_IS[0:int(len(IS_data) / 2)])
def elliptEnvMethod(data, uniqueTrains, **kwargs):
    elp = EllipticEnvelope(support_fraction=1)
    elp.fit_predict(data)
    # Squared Mahalanobis distances of the points of data.
    # Note this is the same as using the "elp.dist_" attribute.
    m_d = elp.mahalanobis(data)
    # Get the regular Mahalanobis distances
    elp_d = np.sqrt(m_d)

    # Automated cut-off: walk the sorted distances and look for a jump where a
    # score exceeds its predecessor by more than SCORE_INCREASE_RATIO.
    SCORE_INCREASE_RATIO = 1.3
    sortD = np.sort(elp_d)
    sortD = sortD[math.floor(len(sortD) / 2):]  # keep the upper half of the sorted scores
    ratioD = np.array([sortD[i] / sortD[i - 1] for i in range(1, len(sortD))])
    # print(f'\nSorted distances: {sortD}\n\n Ratios: {ratioD}')
    ind = np.where(ratioD > SCORE_INCREASE_RATIO)
    if len(ind[0]) >= 1:
        ind = ind[0][0] + 1
        # The score which increases by the ratio compared to the previous score
        SIGMA = sortD[ind]
    else:
        SIGMA = 100.0  # use an arbitrary high score as there are no big score jumps
    SIGMA = max(SIGMA, 4.0)  # keep SIGMA from being too low

    # Segment
    labels = (elp_d >= SIGMA).astype(int)
    # labelOLD = (elp_d > 4.0).astype(int)
    if False:
        print('.dist_ = {}\t .mahalanobis(data) = {}'.format(elp.dist_, m_d))
        print("Sigma labels: {}".format(labels))
        print("\nCovariance = {}".format(elp.covariance_))
    if False:
        labelColours = 'white'
        fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
        colours = np.array(['blue', 'red'])
        ax[0].set_title('Elliptical envelope - Mahalanobis distance', color='white')
        ax[0].plot(elp_d, 'bo', alpha=0.4)  # color=colours[labels]
        ax[1].scatter(data[:, 0], data[:, 1], s=20, color=colours[labels], alpha=0.4)
        ax[1].set_title('Elliptical envelope (adjusted cutoff={})'.format(SIGMA),
                        color='white')
        # ax[2].scatter(data[:, 0], data[:, 1], s=20, color=colours[labelOLD], alpha=0.4)
        # ax[2].set_title('Elliptical envelope (adjusted cutoff={})'.format(4.0), color='white')
        for i, a in enumerate(ax.flat):
            ax[i].set_xlabel('Mean normalised')
            ax[i].set_ylabel('Standard deviation normalised')
        # fig1 = plt.figure(figsize=(5,5))
        plt.show()

    anomalies = [train for ind, train in enumerate(uniqueTrains) if labels[ind] == 1]
    anomalyDates = []
    if kwargs.get('dates', None) is not None:
        dates = kwargs.get('dates')
        anomalyDates = [dates[i] for i, val in enumerate(labels) if val == 1]
    return anomalies, labels, anomalyDates
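The ratio-based cutoff above can be illustrated on a toy array of distances; this is just a numeric sketch of the jump detection, not part of the original module:

import math
import numpy as np

distances = np.array([1.0, 1.1, 1.2, 1.3, 1.4, 5.0, 5.5, 6.0])
upper = np.sort(distances)[math.floor(len(distances) / 2):]  # [1.4, 5.0, 5.5, 6.0]
ratios = upper[1:] / upper[:-1]  # [3.57, 1.1, 1.09]
jump = np.where(ratios > 1.3)[0]  # first big jump sits at index 0
sigma = upper[jump[0] + 1] if len(jump) else 100.0
sigma = max(sigma, 4.0)
print(sigma)  # 5.0: distances at or above this are labelled anomalous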
def findAnomalies(self, saveChart=False, saveEvaluation=False):
    outliers_fraction = 0.15
    clf = EllipticEnvelope(contamination=outliers_fraction)
    predicted_outlier = []
    list_of_df = self.dataCollector.getWithAnomaly()
    for df in list_of_df:
        if df.shape[0] > 0:
            data = df.drop(['anomaly', 'changepoint'], axis=1)
            self.st_tr_time.append(datetime.datetime.now().timestamp())
            prediction = pd.Series(clf.fit_predict(data) * -1, index=df.index) \
                .rolling(5) \
                .median() \
                .fillna(0).replace(-1, 0)
            self.en_tr_time.append(datetime.datetime.now().timestamp())
            # Save the predicted outliers
            predicted_outlier.append(prediction)
            df['rocov_anomaly'] = prediction
    true_outlier = [df.anomaly for df in list_of_df]

    if saveChart:
        for i in range(len(predicted_outlier)):
            plt.figure()
            plt.rcParams["font.family"] = "Times New Roman"
            csfont = {'fontname': 'Times New Roman'}
            plt.xlabel('Time', **csfont)
            plt.ylabel('Value', **csfont)
            plt.title('Robust covariance On File [{}]'.format(i + 1), **csfont)
            predicted_outlier[i].plot(figsize=(12, 6), label='predictions',
                                      marker='o', markersize=5)
            true_outlier[i].plot(marker='o', markersize=2)
            plt.legend(loc='upper right')
            plt.savefig(self.path_to_plt + 'anom/rocov-pre-{}.png'.format(i + 1),
                        format='png')
            print('Chart {} is Generated'.format(i + 1))
            plt.clf()
            plt.close('all')

    if saveChart:
        ts = 1
        for df in list_of_df:
            data = df.drop(['anomaly', 'changepoint'], axis=1)
            pc = PCA(n_components=2).fit_transform(data)
            df[['X', 'Y']] = pc
            plt.figure()
            sb.set(font='Times New Roman')
            sns = sb.scatterplot(data=df, x='X', y='Y', hue='rocov_anomaly',
                                 palette='bright')
            sns.set_title('The Anomaly Detected By Robust covariance, File {}'.format(ts))
            sns.figure.savefig(self.path_to_plt + 'chart/chart-{}.png'.format(ts))
            plt.close('all')
            print('The Chart of File {} is Generated.'.format(ts))
            ts += 1

    if saveEvaluation:
        evaluator = Evaluator(true_outlier, predicted_outlier, metric='binary',
                              numenta_time='30 sec')
        metrics = evaluator.getConfusionMetrics()
        TP = metrics['TP']
        TN = metrics['TN']
        FP = metrics['FP']
        FN = metrics['FN']
        print('\n-----------------------------------------------------')
        print('Robust covariance Outputs: ')
        print(f'\t False Alarm Rate: {round(FP / (FP + TN) * 100, 2)} %')
        print(f'\t Missing Alarm Rate: {round(FN / (FN + TP) * 100, 2)} %')
        print(f'\t Accuracy Rate: {round((TP + TN) / (TP + TN + FN + FP) * 100, 2)} %')
        trainTime = np.array(self.en_tr_time).sum() - np.array(self.st_tr_time).sum()
        print(f'\t Train Time {round(trainTime, 2)}s')
        data = {
            'far': round(FP / (FP + TN) * 100, 2),
            'mar': round(FN / (FN + TP) * 100, 2),
            'acc': round((TP + TN) / (TP + TN + FN + FP) * 100, 2),
            'tr': trainTime,
            'te': 0,
            'tp': TP,
            'tn': TN,
            'fp': FP,
            'fn': FN
        }
        output = OutputWriter(self.path_to_plt, 'RobustCov', data)
        output.write()
model4 = LocalOutlierFactor(n_neighbors=200, algorithm="brute",
                            leaf_size=200, contamination=0.1)  # fix
# model5 = DBSCAN()
model6 = EllipticEnvelope(contamination=0.10, random_state=100,
                          support_fraction=0.1)  # fix

# Model fitting: outlier detection
# print("====== OUTLIER DETECTION =======")
X_train_pred2, X_test_pred2 = model2.fit_predict(df_X_train), model2.fit_predict(df_X_test)
X_train_pred3, X_test_pred3 = model3.fit_predict(df_X_train), model3.fit_predict(df_X_test)
X_train_pred4, X_test_pred4 = model4.fit_predict(df_X_train), model4.fit_predict(df_X_test)
# y_pred5 = model5.fit_predict(df)
X_train_pred6, X_test_pred6 = model6.fit_predict(df_X_train), model6.fit_predict(df_X_test)

# print("====== NOVELTY DETECTION =======")
# model2.fit(df_X_train), model2.fit(df_X_test)
# novelty_X_train_pred2, novelty_X_test_pred2 = model2.predict(
#     df_X_train), model2.predict(df_X_test)
model2.fit(df_X_train)
novelty_X_train_pred2 = model2.predict(df_X_test)
# print("X_train_pred2: ", X_train_pred2)
# print("novelty_X_train_pred2: ", novelty_X_train_pred2)
# with np.printoptions(threshold=np.inf):
#     print("X_train_pred2 shape: ", X_train_pred2.size)
from pandas import read_csv
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.covariance import EllipticEnvelope
from sklearn.metrics import mean_absolute_error

# load the dataset
url = 'https://raw.githubusercontent.com/jbrownlee/Datasets/master/housing.csv'
df = read_csv(url, header=None)
# retrieve the array
data = df.values
# split into input and output elements
X, y = data[:, :-1], data[:, -1]
# split into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                    random_state=1)
# summarize the shape of the training dataset
print(X_train.shape, y_train.shape)
# identify outliers in the training dataset
ee = EllipticEnvelope(contamination=0.01)
yhat = ee.fit_predict(X_train)
# select all rows that are not outliers
mask = yhat != -1
X_train, y_train = X_train[mask, :], y_train[mask]
# summarize the shape of the updated training dataset
print(X_train.shape, y_train.shape)
# fit the model
model = LinearRegression()
model.fit(X_train, y_train)
# evaluate the model
yhat = model.predict(X_test)
# evaluate predictions
mae = mean_absolute_error(y_test, yhat)
print('MAE: %.3f' % mae)
def getRobustCovariance(_df):
    # OUTLIER_FRACTION is expected to be defined at module level.
    clf = EllipticEnvelope(contamination=OUTLIER_FRACTION)
    return clf.fit_predict(_df)
_temp = np.ceil(data['Fare'].quantile(0.75) + (IQR * 3))
data_processed.loc[data_processed.Fare > _temp, 'Fare'] = _temp

X_train_processed, X_test_processed, y_train_processed, y_test_processed = train_test_split(
    data_processed[['Age', 'Fare']].fillna(0),
    data_processed['Survived'],
    test_size=0.2)

from sklearn.covariance import EllipticEnvelope

df_outliers = data.copy()
df_outliers = df_outliers.fillna(0)

column_name = 'Fare'
obj = EllipticEnvelope()
_temp = obj.fit_predict(df_outliers[[column_name]])
print(np.unique(_temp, return_counts=True))
central = df_outliers[_temp == 1][column_name].mean()
max_val = df_outliers[_temp == 1][column_name].max()
min_val = df_outliers[_temp == 1][column_name].min()
# Clip flagged outliers to the inlier extremes on either side of the mean.
df_outliers.loc[_temp == -1, [column_name]] = df_outliers.loc[_temp == -1, [column_name]].apply(
    lambda x: [max_val if y > central else y for y in x])
df_outliers.loc[_temp == -1, [column_name]] = df_outliers.loc[_temp == -1, [column_name]].apply(
    lambda x: [min_val if y < central else y for y in x])
print(data.shape)
print(df_outliers.shape)

column_name = 'Age'
obj = EllipticEnvelope()
_temp = obj.fit_predict(df_outliers[[column_name]])
print(np.unique(_temp, return_counts=True))
central = df_outliers[_temp == 1][column_name].mean()
max_val = df_outliers[_temp == 1][column_name].max()
def minimum_covariance_determinant(X_train):
    # identify outliers in the training dataset
    ee = EllipticEnvelope(contamination=0.01)
    yhat = ee.fit_predict(X_train)
    return yhat
def get_gauss(db: pd.DataFrame) -> np.ndarray:
    # Returns a boolean mask (numpy array, not a list): True marks an outlier.
    ee = EllipticEnvelope(contamination=0.01)
    yhat_gaus = ee.fit_predict(db)
    return yhat_gaus == -1
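A hedged usage sketch for get_gauss (illustrative only, assuming the numpy/pandas/EllipticEnvelope imports used throughout these snippets); the boolean mask it returns can filter the frame directly:

import numpy as np
import pandas as pd
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(7)
db = pd.DataFrame(rng.normal(size=(500, 2)), columns=['a', 'b'])
db.iloc[0] = [12.0, -12.0]  # one planted outlier
is_outlier = get_gauss(db)
clean = db[~is_outlier]  # keep only the inliers
print(len(db) - len(clean), 'rows removed')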
import numpy as np
import matplotlib.pyplot as plt
from scipy.io import loadmat
from sklearn.covariance import EllipticEnvelope
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

data = loadmat('ex8data1.mat')
X = data['X']
plt.figure()

estimator = EllipticEnvelope(contamination=.015)
labels = estimator.fit_predict(X)
e1 = IsolationForest()
labels1 = e1.fit_predict(X)

xx, yy = np.meshgrid(np.linspace(min(X[:, 0]), max(X[:, 0]), 150),
                     np.linspace(min(X[:, 1]), max(X[:, 1]), 150))
# Z = estimator.predict(np.c_[xx.ravel(), yy.ravel()])
# Z = Z.reshape(xx.shape)
# plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='black', alpha=0.5)
#
# Z1 = e1.predict(np.c_[xx.ravel(), yy.ravel()])
# Z1 = Z1.reshape(xx.shape)
# plt.contour(xx, yy, Z1, levels=[0], linewidths=2, colors='red', alpha=0.2)