def outlier_removal2(features, samples, cv_predict):
    outliers_fraction = 0.1
    print(cv_predict.shape)
    print(samples.shape)
    test = np.column_stack((cv_predict, samples))
    clf = EllipticEnvelope(contamination=.1)
    # clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
    #                       kernel="rbf", gamma=0.1)
    clf.fit(test)
    y_pred = clf.decision_function(test).ravel()
    # keep everything above the outliers_fraction percentile of the scores
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred_new = y_pred > threshold
    print(y_pred_new)
    print(samples.shape)
    print(samples[y_pred_new].shape)
    print(features.shape)
    print(features[y_pred_new].shape)
    return features[y_pred_new], samples[y_pred_new]
def clean_series(self, token, discard=5):
    """
    Remove outliers from the ratio series for a token.

    Args:
        discard (int): Drop the most outlying X% of the data.

    Returns: OrderedDict{year: wpm}
    """
    series = self.ratios[token]
    X = np.array(list(series.values()))[:, np.newaxis]

    env = EllipticEnvelope()
    env.fit(X)

    # Score each data point.
    y_pred = env.decision_function(X).ravel()

    # Get the discard threshold.
    threshold = stats.scoreatpercentile(y_pred, discard)

    return OrderedDict([
        (year, ratio)
        for (year, ratio), pred in zip(series.items(), y_pred)
        if pred > threshold
    ])
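# A minimal, self-contained sketch of the percentile-trim idea used by
# clean_series above; the toy `series` (year -> ratio) and the planted
# outlier are hypothetical, not from the original corpus.
from collections import OrderedDict

import numpy as np
from scipy import stats
from sklearn.covariance import EllipticEnvelope

series = OrderedDict((1900 + i, 1.0 + 0.01 * i) for i in range(30))
series[1915] = 9.0  # planted outlier

X = np.array(list(series.values()))[:, np.newaxis]
scores = EllipticEnvelope().fit(X).decision_function(X).ravel()
threshold = stats.scoreatpercentile(scores, 5)  # drop the lowest-scoring 5%
cleaned = OrderedDict(
    (year, ratio)
    for (year, ratio), s in zip(series.items(), scores)
    if s > threshold)
print(len(series), '->', len(cleaned))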
def calc(self, outliers_fraction):
    data, dqs, raw = self.get_data()
    clf = EllipticEnvelope(contamination=outliers_fraction)
    # zip() returns an iterator in Python 3, so materialize it before fitting
    X = list(zip(data['Tbandwidth'], data['Tlatency'], data['Tframerate']))
    clf.fit(X)
    data['MDist'] = clf.mahalanobis(X)
    # picking "bad" outliers, not good ones
    outliers = chi2_outliers(data, [.8, .9, .95], 3)
    outliers = [i[i['Tbandwidth'] < i['Tlatency']] for i in outliers]
    # making sure we don't remove aberrantly good framerates
    outliers = [k[k['Tframerate'] < (k['Tframerate'].mean() + k['Tframerate'].std())]
                for k in outliers]
    outliers = [t.sort_values(by='MDist', ascending=False)
                 .drop_duplicates()
                 .drop(['Tbandwidth', 'Tlatency', 'Tframerate'], axis=1)
                for t in outliers]
    return outliers, dqs, (data.sort_values(by='MDist', ascending=False)
                               .drop_duplicates()
                               .drop(['Tbandwidth', 'Tlatency', 'Tframerate'], axis=1))
def filter_remove_outlayers(self, flat, minimum_value=0):
    """
    Remove outliers using the elliptic envelope from scikit-learn.

    :param flat:
    :param minimum_value:
    :return:
    """
    from sklearn.covariance import EllipticEnvelope
    flat0 = flat.copy()
    flat0[np.isnan(flat)] = 0
    x, y = np.nonzero(flat0)
    z = flat[(x, y)]
    data = np.asarray([x, y, z]).T

    clf = EllipticEnvelope(contamination=.1)
    clf.fit(data)
    y_pred = clf.decision_function(data)
    out_inds = y_pred < minimum_value
    flat[(x[out_inds], y[out_inds])] = np.nan
    return flat
def model_2_determine_test_data_similarity(self, model):
    clf_EE = {}
    model_EE = {}
    for i in range(len(model)):
        clf = EllipticEnvelope(contamination=0.01, support_fraction=1)
        clf_EE[i] = clf
        EEmodel = clf.fit(model[i])
        model_EE[i] = EEmodel
    return clf_EE, model_EE
def plot(X, y):
    proj = TSNE().fit_transform(X)
    # Outlier detection
    e = EllipticEnvelope(assume_centered=True, contamination=.25)
    e.fit(X)
    good = np.where(e.predict(X) == 1)
    # keep the projection in sync with the filtered labels
    proj = proj[good]
    X = X[good]
    y = y[good]
    scatter(proj, y)
def filterOut(x):
    x = np.array(x)
    outliers_fraction = 0.05
    # clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05, kernel="rbf", gamma=0.1)
    clf = EllipticEnvelope(contamination=outliers_fraction)
    clf.fit(x)
    y_pred = clf.decision_function(x).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred = y_pred > threshold
    return y_pred
def module4(self):
    '''
    Detect outliers in the input one-dimensional array using anomaly detection.
    '''
    # get data
    img = cv2.imread('../saliency_detection/image/pearl.png')
    b, g, r = cv2.split(img)
    B, G, R = map(lambda x, y, z: x * 1. - (y * 1. + z * 1.) / 2.,
                  [b, g, r], [r, r, g], [g, b, b])
    Y = (r * 1. + g * 1.) / 2. - np.abs(r * 1. - g * 1.) / 2. - b * 1.
    # clamp negative values to zero
    R[R < 0] = 0
    G[G < 0] = 0
    B[B < 0] = 0
    Y[Y < 0] = 0
    rg = cv2.absdiff(R, G)
    by = cv2.absdiff(B, Y)
    img1 = rg
    img2 = by
    rg, by = map(lambda x: x.reshape((len(b[0]) * len(b[:, 0]), 1)), [rg, by])
    data = np.hstack((rg, by))
    data = data.astype(np.float64)
    data = np.delete(data, range(0, len(data[:, 0]), 2), 0)

    # grid
    xx1, yy1 = np.meshgrid(np.linspace(-10, 300, 500), np.linspace(-10, 300, 500))

    # fit the model and find the frontier;
    # a larger contamination shrinks the fitted ellipse
    clf = EllipticEnvelope(support_fraction=1, contamination=0.01)
    print('data.shape =>', data.shape)
    print('learning...')
    clf.fit(data)  # may fail if the data contains zeros
    print('complete learning!')

    # classify the data with the fitted model and draw the ellipse
    z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
    z1 = z1.reshape(xx1.shape)
    plt.contour(xx1, yy1, z1, levels=[0], linewidths=2, colors='r')

    # plot
    plt.scatter(data[:, 0], data[:, 1], color='black')
    plt.title("Outlier detection")
    plt.xlim((xx1.min(), xx1.max()))
    plt.ylim((yy1.min(), yy1.max()))
    plt.pause(.001)
    # plt.show()
    cv2.imshow('rg', img1 / np.amax(img1))
    cv2.imshow('by', img2 / np.amax(img2))
def find_outlier_test_homes(df, all_homes, appliance, outlier_features,
                            outliers_fraction=0.1):
    from scipy import stats
    from sklearn.covariance import EllipticEnvelope
    clf = EllipticEnvelope(contamination=.1)
    # drop trailing features one at a time until the envelope can be fitted
    try:
        X = df.loc[all_homes[appliance]][outlier_features].values
        clf.fit(X)
    except Exception:
        try:
            X = df.loc[all_homes[appliance]][outlier_features[:-1]].values
            clf.fit(X)
        except Exception:
            try:
                X = df.loc[all_homes[appliance]][outlier_features[:-2]].values
                clf.fit(X)
            except Exception:
                print("outlier cannot be found")
                return df.loc[all_homes[appliance]].index.tolist()
    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred = y_pred > threshold
    return df.loc[all_homes[appliance]][~y_pred].index.tolist()
def ellipticenvelope(data, fraction=0.02):
    elenv = EllipticEnvelope(contamination=fraction)
    elenv.fit(data)
    score = elenv.predict(data)
    # 1-based row numbers for every observation
    numeration = np.arange(1, len(data) + 1).reshape(-1, 1)
    # keep only the rows flagged as outliers (predict == -1)
    anomalies = numeration[score == -1]
    return anomalies
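# Hypothetical usage of the helper above: it returns the 1-based row numbers
# of the points flagged as outliers; the demo data below is synthetic.
rng = np.random.RandomState(0)
demo = np.vstack([rng.normal(0, 1, size=(98, 2)), [[8., 8.], [9., 9.]]])
print(ellipticenvelope(demo, fraction=0.02))  # expected: rows 99 and 100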
def elliptic_envelope(df, modelDir, norm_confidence=0.95):
    from sklearn.covariance import EllipticEnvelope
    from scipy.stats import normaltest
    if "ds" in df.columns:
        del df["ds"]
    model = EllipticEnvelope()
    # keep only the columns whose normality cannot be rejected
    test_stats, p_vals = normaltest(df.values, axis=0)
    normal_cols = p_vals >= (1 - norm_confidence)
    df = df.loc[:, normal_cols]
    if df.shape[1] == 0:
        return None
    # use column assignment (df["outlier"]), not attribute assignment,
    # so the result is actually stored in the frame
    df["outlier"] = model.fit_predict(df.values)
    df["outlier"] = df["outlier"] < 0  # 1 if inlier, -1 if outlier
    return df
def labelValidSkeletons(skel_file, valid_index, trajectories_data,
                        fit_contamination=0.05):
    # calculate valid widths if they were not used
    calculate_widths(skel_file)

    # fit the outlier classifier on the valid skeletons
    X4fit = nodes2Array(skel_file, valid_index)
    clf = EllipticEnvelope(contamination=fit_contamination)
    clf.fit(X4fit)

    # score all skeletons with the fitted classifier
    X = nodes2Array(skel_file)  # use all the indexes
    y_pred = clf.decision_function(X).ravel()  # less than zero would be an outlier

    # label rows of valid individual skeletons as GOOD_SKE
    trajectories_data['auto_label'] = (y_pred > 0).astype(int) * wlab['GOOD_SKE']
    saveLabelData(skel_file, trajectories_data)
def test_score_samples():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf1 = EllipticEnvelope(contamination=0.2).fit(X_train)
    clf2 = EllipticEnvelope().fit(X_train)
    assert_array_equal(clf1.score_samples([[2., 2.]]),
                       clf1.decision_function([[2., 2.]]) + clf1.offset_)
    assert_array_equal(clf2.score_samples([[2., 2.]]),
                       clf2.decision_function([[2., 2.]]) + clf2.offset_)
    assert_array_equal(clf1.score_samples([[2., 2.]]),
                       clf2.score_samples([[2., 2.]]))
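# Quick numeric check, on toy data, of the identity the test above relies on:
# score_samples(X) == decision_function(X) + offset_ for EllipticEnvelope.
import numpy as np
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(0)
X = rng.randn(50, 2)
clf = EllipticEnvelope(contamination=0.2).fit(X)
assert np.allclose(clf.score_samples(X),
                   clf.decision_function(X) + clf.offset_)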
def labelValidSkeletons(skel_file):
    calculate_widths(skel_file)

    # get valid rows using the trajectory displacement and the skeletonization success
    valid_index, trajectories_data = getValidIndexes(skel_file)

    # fit the outlier classifier on the valid skeletons
    X4fit = nodes2Array(skel_file, valid_index)
    clf = EllipticEnvelope(contamination=.1)
    clf.fit(X4fit)

    # score all skeletons with the fitted classifier
    X = nodes2Array(skel_file)
    y_pred = clf.decision_function(X).ravel()  # less than zero would be an outlier

    # label rows of valid individual skeletons as GOOD_SKE
    trajectories_data['auto_label'] = (y_pred > 0).astype(int) * wlab['GOOD_SKE']
    saveLabelData(skel_file, trajectories_data)
def test_outlier_detection():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    assert_raises(NotFittedError, clf.predict, X)
    assert_raises(NotFittedError, clf.decision_function, X)
    clf.fit(X)
    y_pred = clf.predict(X)
    # the raw_values keyword exists only in older scikit-learn releases
    decision = clf.decision_function(X, raw_values=True)
    decision_transformed = clf.decision_function(X, raw_values=False)

    assert_array_almost_equal(decision, clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.0)
    assert sum(y_pred == -1) == sum(decision_transformed < 0)
def test_outlier_detection():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    clf.fit(X)
    y_pred = clf.predict(X)

    assert_array_almost_equal(
        clf.decision_function(X, raw_values=True), clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.)
def labelValidSkeletons_old(skeletons_file, good_skel_row, fit_contamination=0.05):
    base_name = getBaseName(skeletons_file)
    progress_timer = timeCounterStr('')

    print_flush(base_name + ' Filter Skeletons: Starting...')
    with pd.HDFStore(skeletons_file, 'r') as table_fid:
        trajectories_data = table_fid['/trajectories_data']

    trajectories_data['is_good_skel'] = trajectories_data['has_skeleton']

    if good_skel_row.size > 0:
        # nothing to do if there are no valid skeletons left.
        print_flush(base_name + ' Filter Skeletons: Reading features for outlier identification.')

        # fit the outlier classifier
        nodes4fit = ['/skeleton_length', '/contour_area'] + \
                    ['/' + name_width_fun(part) for part in worm_partitions]
        X4fit = nodes2Array(skeletons_file, nodes4fit, good_skel_row)
        assert not np.any(np.isnan(X4fit))

        print_flush(base_name + ' Filter Skeletons: Fitting elliptic envelope. Total time:' +
                    progress_timer.getTimeStr())
        # TODO: there is a problem with singular covariance matrices that I need
        # to figure out how to solve
        clf = EllipticEnvelope(contamination=fit_contamination)
        clf.fit(X4fit)

        print_flush(base_name + ' Filter Skeletons: Calculating outliers. Total time:' +
                    progress_timer.getTimeStr())
        # score all skeletons with the fitted classifier
        X = nodes2Array(skeletons_file, nodes4fit)  # use all the indexes
        y_pred = clf.decision_function(X).ravel()  # less than zero would be an outlier

        print_flush(base_name + ' Filter Skeletons: Labeling valid skeletons. Total time:' +
                    progress_timer.getTimeStr())
        # label rows of valid individual skeletons as GOOD_SKE
        trajectories_data['is_good_skel'] = (y_pred > 0).astype(int)

    # save the new is_good_skel column
    saveModifiedTrajData(skeletons_file, trajectories_data)

    print_flush(base_name + ' Filter Skeletons: Finished. Total time:' +
                progress_timer.getTimeStr())
def test_elliptic_envelope():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    assert_raises(NotFittedError, clf.predict, X)
    assert_raises(NotFittedError, clf.decision_function, X)
    clf.fit(X)
    y_pred = clf.predict(X)
    scores = clf.score_samples(X)
    decisions = clf.decision_function(X)

    assert_array_almost_equal(scores, -clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.)
    assert(sum(y_pred == -1) == sum(decisions < 0))
def transform(features, labels):
    # for ff, ll in zip(features, labels):
    #     print(ll, ff)
    # for rr in range(0, len(features)):
    #     features[rr] = scaler.fit_transform(features[rr])

    print("transforming features via pca")
    pca = PCA(n_components=30)
    features = pca.fit_transform(features)

    envelope = EllipticEnvelope()
    envelope.fit(features)
    print(envelope.predict(features))

    scaler = MinMaxScaler()
    features = scaler.fit_transform(features)

    return features, labels
def detect_outliers(X, station):
    if station == 'hoerning':
        outlierfraction = 0.0015
        classifier = svm.OneClassSVM(nu=0.95 * outlierfraction + 0.05,
                                     kernel='rbf', gamma=0.1)
        Xscaler = StandardScaler(copy=True, with_mean=True, with_std=True).fit(X)
        X_scaled = Xscaler.transform(X)
        classifier.fit(X_scaled)
        svcpred = classifier.decision_function(X_scaled).ravel()
        threshold = stats.scoreatpercentile(svcpred, 100 * outlierfraction)
        inlierpred = svcpred > threshold
    else:
        outlierfraction = 0.0015
        classifier = EllipticEnvelope(contamination=outlierfraction)
        classifier.fit(X)
        gausspred = classifier.decision_function(X).ravel()
        threshold = stats.scoreatpercentile(gausspred, 100 * outlierfraction)
        inlierpred = gausspred > threshold
    return inlierpred
def CovEstOD(data, classifier=None, N=1, **kw):
    if classifier is None:
        from sklearn.covariance import EllipticEnvelope
        contamination = N / data.shape[0]
        classifier = EllipticEnvelope(support_fraction=1., contamination=contamination)
    classifier.fit(data)
    clipix, = np.where(classifier.predict(data) == -1)

    wdb = kw.pop('with_decision_boundary', False)
    # TODO: A better way of finding the decision boundary
    if wdb:
        # T (eigenvectors of the precision matrix) is the transformation matrix
        # between principal axes and data coordinates
        w, T = np.linalg.eigh(classifier.precision_)
        Ti = np.linalg.inv(T)
        # diagonalizing the precision matrix ==> quadratic representation of the
        # decision boundary (ellipse): z^T M z = threshold, where x - <x> = Tz
        # transforms to principal axes
        M = np.dot(Ti, classifier.precision_) * T
        # semi-major & semi-minor axes
        # NOTE: 'threshold' is an attribute of older scikit-learn releases
        a, b = np.sqrt(classifier.threshold / np.diag(M))
        # T is an (im)proper rotation matrix
        theta = np.degrees(np.arccos(T[0, 0]))
        # if det(T) = -1 ==> improper rotation matrix
        # (rotoinversion - one of the axes is inverted)
        theta = np.linalg.det(T) * theta
        decision_boundary = Ellipse(classifier.location_, 2 * a, 2 * b, theta, color='m')
        return clipix, decision_boundary
    else:
        return clipix
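# With current scikit-learn it is usually simpler to draw the envelope's
# decision boundary by contouring decision_function at level 0 instead of
# reconstructing the ellipse from the precision matrix; a sketch on synthetic
# 2-D data (all names below are local to this example):
import numpy as np
import matplotlib.pyplot as plt
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(0)
pts = rng.multivariate_normal([0, 0], [[2, 1], [1, 2]], size=200)
env = EllipticEnvelope(contamination=0.02).fit(pts)

gx, gy = np.meshgrid(np.linspace(-6, 6, 200), np.linspace(-6, 6, 200))
gz = env.decision_function(np.c_[gx.ravel(), gy.ravel()]).reshape(gx.shape)
plt.scatter(pts[:, 0], pts[:, 1], s=8, color='k')
plt.contour(gx, gy, gz, levels=[0], linewidths=2, colors='m')
plt.show()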
def find_outlier_train(ser, outliers_fraction=0.1, min_units=0.2):
    # Returns outliers, inliers
    X = ser[ser > min_units].values.reshape(-1, 1)
    # is_normal_data = is_normal(ser)
    # FOR NOW only using the robust estimator of covariance
    is_normal_data = True
    if is_normal_data:
        # Use robust estimator of covariance
        from sklearn.covariance import EllipticEnvelope
        clf = EllipticEnvelope(contamination=.1)
    else:
        # Data is not normally distributed, use OneClassSVM based outlier detection
        from sklearn import svm
        clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                              kernel="rbf", gamma=0.1)
    from scipy import stats
    clf.fit(X)
    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred = y_pred > threshold
    return ser[ser > min_units][~y_pred], ser[ser > min_units][y_pred]
def anomaly_detection(features, labels):
    # Use an anomaly detection method (multivariate gaussian distribution)
    # to identify POIs.
    non_pois = features[labels == 0]
    pois = features[labels == 1]
    print("non poi size", non_pois.shape, pois.shape, features.shape)

    # Split data into train, test and cross-validation sets for anomaly detection
    split1 = produce_spliting_array(non_pois.shape[0], .75)
    X_train = non_pois[split1 == 1]
    X_intermediate = non_pois[split1 == 0]
    print("size intermediate", X_intermediate.shape)

    split2 = produce_spliting_array(X_intermediate.shape[0], .5)
    X_test = X_intermediate[split2 == 1]
    label_test = np.zeros((X_test.shape[0],), dtype=int) - 1
    X_cv = X_intermediate[split2 == 0]
    label_cv = np.zeros((X_cv.shape[0],), dtype=int) - 1

    split3 = produce_spliting_array(pois.shape[0], .5)
    X_test = np.vstack((X_test, pois[split3 == 1]))
    label_test = np.hstack((label_test, np.ones(sum(split3), dtype=int)))
    X_cv = np.vstack((X_cv, pois[split3 == 0]))
    label_cv = np.hstack((label_cv, np.ones(sum(split3 == 0), dtype=int)))

    print("size X_train", X_train.shape)
    print("size test data", X_test.shape, label_test.shape)
    print("size cv data", X_cv.shape, label_cv.shape)
    print("size splits", len(split1), len(split2), len(split3))

    from sklearn.covariance import EllipticEnvelope
    # NOTE: recent scikit-learn releases require contamination in (0, 0.5]
    detector = EllipticEnvelope(contamination=.85)
    detector.fit(X_train)
    pred_cv = detector.predict(X_cv)
    print(pred_cv)
    print(label_cv)
    print(detector.score(X_cv, label_cv))
cls_nums = np.array(np.unique(y_trn, return_counts=True)).T
# assume the minority class is the outlier class
ol_label, ol_count = cls_nums[0, 0], cls_nums[0, 1]
il_label, il_count = cls_nums[1, 0], cls_nums[1, 1]
if cls_nums[0, 1] > cls_nums[1, 1]:
    ol_label, ol_count = cls_nums[1, 0], cls_nums[1, 1]
    il_label, il_count = cls_nums[0, 0], cls_nums[0, 1]
outlier_fraction = ol_count / n_samples
print("Outlier fraction: {}".format(outlier_fraction))

clsf_names = ["Robust covariance", "One-class SVM",
              "Isolation Forest", "Local Outlier Factor"]
anomaly_algorithms = [
    EllipticEnvelope(contamination=outlier_fraction),
    svm.OneClassSVM(nu=outlier_fraction, kernel="rbf", gamma=0.1),
    IsolationForest(contamination=outlier_fraction, random_state=42),
    LocalOutlierFactor(n_neighbors=35, contamination=outlier_fraction),
]

if args.admodel < 0 or args.admodel > 3:
    print("Anomaly detection algorithm ID should be between 0 and 3")
    exit()

clsf = anomaly_algorithms[args.admodel]
clsf.fit(x_trn)
# LOF (without novelty=True) only supports fit_predict
if clsf_names[args.admodel] == "Local Outlier Factor":
    y_pred = clsf.fit_predict(x_tst)
else:
    y_pred = clsf.predict(x_tst)
def test_elliptic_envelope():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    with pytest.raises(NotFittedError):
        clf.predict(X)
    with pytest.raises(NotFittedError):
        clf.decision_function(X)
    clf.fit(X)
    y_pred = clf.predict(X)
    scores = clf.score_samples(X)
    decisions = clf.decision_function(X)

    assert_array_almost_equal(scores, -clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.)
    assert (sum(y_pred == -1) == sum(decisions < 0))
def find_outliers(datestart, dateend, plot=False, cut=-0.05):
    numtopics = 84
    di = datetime2str2(datestart)
    dfin = datetime2str2(dateend)
    if dfin < di:
        di, dfin = dfin, di

    afile = "/home/ubuntu/mysql_insightwiki_auth.txt"
    with open(afile) as a:
        passwd = a.readline().rstrip()
    host = 'localhost'
    user = '******'
    db = 'wikidata'
    con = mdb.connect(host, user, passwd, db)  # , port=3307)

    with con:
        curt = con.cursor()
        # sql = "SELECT COUNT(*) FROM `topics`"
        sql = "SELECT `Id`,`topic_label`,`topic_string` FROM `topics`;"
        curt.execute(sql)
        topics = [[0, 'nothing', 'Filler to match index']]
        for topic in curt:
            topics.append(topic)

    data = {}
    df = list(range(numtopics + 1))  # list so items can be assigned in Python 3
    with con:
        curt = con.cursor()
        sql = "SELECT `Id`,`topic_label`,`topic_string` FROM `topics`;"
        curt.execute(sql)
        for row in curt:
            cur = con.cursor()
            sql = '''SELECT `page_views`.`dateonly` AS `vd`,
                            AVG(`page_views`.`count`) AS `vc`,
                            `topics`.`topic_label`, `topics`.`topic_string`
                     FROM `topics`
                     INNER JOIN `page_views` ON `topics`.`ID` = `page_views`.`topic_id`
                     WHERE `topic_id`=%s
                     GROUP BY `page_views`.`dateonly` '''
            data[row[1]] = read_sql(sql, con, params=[row[0]])
            df[row[0]] = data[row[1]]
    topicdata = df

    d = topicdata[topics[3][0]]
    p = d[(d['vd'] > di) & (d['vd'] < dfin)]['vc'].values

    # initializing the array to hold the rows to cluster;
    # the 0th position is fake so that the index matches the sql index
    clusinp = []
    clusinp.append(gen_feat([0, 0, 0, 0, 0]))
    chinaoff = 6000
    # populating the array to go into the KMeans
    for index, topic in enumerate(topics):
        if topic[0] != 0:
            d = topicdata[topic[0]]
            ppre = d[(d['vd'] > di) & (d['vd'] < dfin)]['vc'].values
            p = gen_feat(ppre)
            if topic[0] == 52:
                p = gen_feat([x - chinaoff if x - chinaoff >= 0 else 0 for x in ppre])
            clusinp.append(p)

    # cleaning up the array, making it numpy
    clusinp = np.array(clusinp)
    clusinp[0] = clusinp[5]  # making sure the throw-away first row matches in size

    # contam = 0.325
    contamfix = 0.1
    colors = ['m', 'g', 'b']
    X1 = clusinp
    xx1, yy1 = np.meshgrid(np.linspace(0, 10000, 500), np.linspace(-1.5, 1.5, 500))
    ee = EllipticEnvelope(support_fraction=1., contamination=contamfix)
    # ee = OneClassSVM(nu=contam2, gamma=0.05, kernel='rbf')
    ee.fit(clusinp)
    outliers = ee.decision_function(X1)

    if plot:
        get_ipython().magic(u'matplotlib inline')
        Z1 = ee.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
        Z1 = Z1.reshape(xx1.shape)
        legend1 = plt.contour(xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[1])
        plt.scatter(X1[:, 0], X1[:, 1], color='black')
        plt.xlim((xx1.min(), xx1.max()))
        plt.ylim((yy1.min(), yy1.max()))
        plt.show()

    out = []
    for index, outlier in enumerate(outliers):
        row = [index, outlier, topics[index][1],
               int(np.round(clusinp[index][0])),
               int(np.round(100 * clusinp[index][1]))]
        if outlier < cut and index != 0 and row[3] > 8:
            out.append(row)
    out = sorted(out, key=lambda x: -x[4])
    return out
        #          label=target_name.decode('utf8'))
        x, y = find_boundary(X_transformed[kclusters == i, 0],
                             X_transformed[kclusters == i, 1], 5)
        plt.plot(x, y, '-k', lw=2., color=cluster_color)

        # create a mesh to plot in
        h = .02  # step size in the mesh
        x_min = X_transformed[kclusters == i, 0].min() - 1
        x_max = X_transformed[kclusters == i, 0].max() + 1
        y_min = X_transformed[kclusters == i, 1].min() - 1
        y_max = X_transformed[kclusters == i, 1].max() + 1
        xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                             np.arange(y_min, y_max, h))

        clf = EllipticEnvelope(contamination=.1)
        clf.fit(X_transformed[kclusters == i])
        pred = clf.decision_function(X_transformed[kclusters == i]).ravel()
        threshold = stats.scoreatpercentile(pred, 100 * outliers_fraction)
        print("INFO: Cluster: ", i, " Threshold: ", threshold)

        Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)
        # plt.contour(xx, yy, Z,
        #             levels=[threshold],
        #             linewidths=2,
        #             linestyles='solid',
        #             colors=(cluster_color,))
def __getRemovedOutlierRobustDf(self, _df, _column_drop=True):
    robust = EllipticEnvelope(contamination=OUTLIER_FRACTION)
    return self.__getRemoveOutlierDf(_df, robust, self.column_name_robust, _column_drop)
def elliptic_envelope(series, contamination=0.1):
    clf = EllipticEnvelope(contamination=contamination, random_state=0)
    series = series.values.reshape(-1, 1)
    clf.fit(series)
    return clf.predict(series)
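# Hypothetical usage of the wrapper above on a pandas Series; predict()
# returns 1 for inliers and -1 for outliers. The data below is synthetic.
import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
values = pd.Series(np.r_[rng.normal(10, 1, 95), [25., 26., 27., 28., 29.]])
labels = elliptic_envelope(values, contamination=0.05)
print((labels == -1).sum())  # roughly the 5 planted outliers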
"robust covariance estimator": EllipticEnvelope(contamination=.1)} # Compare given classifiers under given settings xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 1000), np.linspace(0, 100, 1000)) n_inliers = int((1. - outliers_fraction) * n_samples) n_outliers = int(outliers_fraction * n_samples) # Fit the problem with varying cluster separation np.random.seed(42) # Data generation # Fit the model with the One-Class SVM #plt.figure(figsize=(10, 5)) clf = EllipticEnvelope(contamination=.1) # fit the data and tag outliers clf.fit(XY) y_pred = clf.decision_function(XY).ravel() threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction) y_pred = y_pred > threshold # plot the levels lines and the points Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) subplot = ax[i] subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r) a = subplot.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red') subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
def anomaly_detection_AUC_experiment_batch(anomaly_method, dataset,
                                           X_train_in_folds, X_test_in_folds,
                                           y_train_in_folds, y_test_in_folds):
    rng = np.random.RandomState(42)
    n_folds = len(X_train_in_folds)
    auc_test_array = np.zeros((n_folds,))
    auc_train_array = np.zeros((n_folds,))
    time_of_algorithm_test = np.zeros((n_folds,))
    time_of_algorithm_train = np.zeros((n_folds,))
    for fold_index in range(n_folds):
        X_train = X_train_in_folds[fold_index]
        X_test = X_test_in_folds[fold_index]
        y_train = y_train_in_folds[fold_index]
        y_test = y_test_in_folds[fold_index]
        if fold_index == 0:
            y = list(y_train)
            y.extend(y_test)
            y = np.asarray(y)
            # print(y)
            percentage_of_anomalies = sum(y == -1) / len(y)
            print("percentage of the anomalies = " + str(percentage_of_anomalies))
        if anomaly_method == "iso_forest":
            clf = IsolationForest(random_state=rng)
            start = time.time()
            clf.fit(X=X_train)
            scores_train = clf.decision_function(X=X_train)
            end = time.time()
            time_of_algorithm_train[fold_index] = end - start
            start = time.time()
            scores_test = clf.decision_function(X=X_test)
            end = time.time()
            time_of_algorithm_test[fold_index] = end - start
        elif anomaly_method == "one_class_SVM":
            clf = OneClassSVM(gamma='auto')
            start = time.time()
            clf.fit(X=X_train)
            scores_train = clf.decision_function(X=X_train)
            end = time.time()
            time_of_algorithm_train[fold_index] = end - start
            start = time.time()
            scores_test = clf.decision_function(X=X_test)
            end = time.time()
            time_of_algorithm_test[fold_index] = end - start
        elif anomaly_method == "LOF":
            n_neighbors = 10
            clf = LOF(n_neighbors=n_neighbors, contamination=0.1)
            start = time.time()
            clf.fit(X=X_train)
            scores_train = clf.negative_outlier_factor_
            end = time.time()
            time_of_algorithm_train[fold_index] = end - start
            clf = LOF(n_neighbors=n_neighbors, novelty=True, contamination=0.1)
            start = time.time()
            clf.fit(X=X_train)
            scores_test = clf.decision_function(X=X_test)
            end = time.time()
            time_of_algorithm_test[fold_index] = end - start
        elif anomaly_method == "covariance_estimator":
            clf = EllipticEnvelope(random_state=rng)
            start = time.time()
            clf.fit(X=X_train)
            scores_train = clf.decision_function(X=X_train)
            end = time.time()
            time_of_algorithm_train[fold_index] = end - start
            start = time.time()
            scores_test = clf.decision_function(X=X_test)
            end = time.time()
            time_of_algorithm_test[fold_index] = end - start
        elif anomaly_method == "iMondrian_forest":
            settings, data, param, cache, train_ids_current_minibatch = \
                MondrianForest.prepare_training_data(X=X_train, num_trees=100)
            clf = MondrianForest(settings, data)
            subsampling_size = 256
            start = time.time()
            # clf.fit(data, train_ids_current_minibatch, settings, param, cache,
            #         subsampling_size=None)
            clf.fit(data, train_ids_current_minibatch, settings, param, cache,
                    subsampling_size=subsampling_size)
            scores, scores_shifted = clf.get_anomaly_scores(
                test_data=X_train, settings=settings, subsampling_size=None)
            scores_train = scores_shifted
            end = time.time()
            time_of_algorithm_train[fold_index] = end - start
            start = time.time()
            scores, scores_shifted = clf.get_anomaly_scores(
                test_data=X_test, settings=settings, subsampling_size=None)
            scores_test = scores_shifted
            end = time.time()
            time_of_algorithm_test[fold_index] = end - start
        # scores_test = -1 * scores_test  # --> to have: the more score, the less anomaly
        fpr_test, tpr_test, thresholds_test = metrics.roc_curve(
            y_test, scores_test, pos_label=1)  # --> https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
        fpr_train, tpr_train, thresholds_train = metrics.roc_curve(
            y_train, scores_train, pos_label=1)
        # plt.plot(fpr_test, tpr_test)
        # plt.show()
        # plt.plot(fpr_train, tpr_train)
        # plt.show()
        auc_test = metrics.auc(fpr_test, tpr_test)  # --> https://scikit-learn.org/stable/modules/generated/sklearn.metrics.auc.html
        print("Fold: " + str(fold_index) + " ---> AUC for test: " + str(auc_test))
        auc_test_array[fold_index] = auc_test
        auc_train = metrics.auc(fpr_train, tpr_train)
        print("Fold: " + str(fold_index) + " ---> AUC for train: " + str(auc_train))
        auc_train_array[fold_index] = auc_train
    auc_test_mean = auc_test_array.mean()
    auc_test_std = auc_test_array.std()
    auc_train_mean = auc_train_array.mean()
    auc_train_std = auc_train_array.std()
    time_of_algorithm_train_mean = time_of_algorithm_train.mean()
    time_of_algorithm_train_std = time_of_algorithm_train.std()
    time_of_algorithm_test_mean = time_of_algorithm_test.mean()
    time_of_algorithm_test_std = time_of_algorithm_test.std()
    print("Average AUC for test data: " + str(auc_test_mean) + " +- " + str(auc_test_std))
    print("Average time for test data: " + str(time_of_algorithm_test_mean) + " +- " + str(time_of_algorithm_test_std))
    print("Average AUC for train data: " + str(auc_train_mean) + " +- " + str(auc_train_std))
    print("Average time for train data: " + str(time_of_algorithm_train_mean) + " +- " + str(time_of_algorithm_train_std))
    if anomaly_method == "LOF" or anomaly_method == "CAD":
        path = './output/batch/' + dataset + "/" + anomaly_method + "/neigh=" + str(n_neighbors) + "/"
    else:
        path = './output/batch/' + dataset + "/" + anomaly_method + "/"
    save_np_array_to_txt(variable=auc_test_array, name_of_variable="auc_test_array", path_to_save=path)
    save_np_array_to_txt(variable=auc_test_mean, name_of_variable="auc_test_mean", path_to_save=path)
    save_np_array_to_txt(variable=auc_test_std, name_of_variable="auc_test_std", path_to_save=path)
    save_np_array_to_txt(variable=auc_train_array, name_of_variable="auc_train_array", path_to_save=path)
    save_np_array_to_txt(variable=auc_train_mean, name_of_variable="auc_train_mean", path_to_save=path)
    save_np_array_to_txt(variable=auc_train_std, name_of_variable="auc_train_std", path_to_save=path)
    save_np_array_to_txt(variable=time_of_algorithm_test, name_of_variable="time_of_algorithm_test", path_to_save=path)
    save_np_array_to_txt(variable=time_of_algorithm_test_mean, name_of_variable="time_of_algorithm_test_mean", path_to_save=path)
    save_np_array_to_txt(variable=time_of_algorithm_test_std, name_of_variable="time_of_algorithm_test_std", path_to_save=path)
    save_np_array_to_txt(variable=time_of_algorithm_train, name_of_variable="time_of_algorithm_train", path_to_save=path)
    save_np_array_to_txt(variable=time_of_algorithm_train_mean, name_of_variable="time_of_algorithm_train_mean", path_to_save=path)
    save_np_array_to_txt(variable=time_of_algorithm_train_std, name_of_variable="time_of_algorithm_train_std", path_to_save=path)
    save_np_array_to_txt(variable=percentage_of_anomalies, name_of_variable="percentage_of_anomalies", path_to_save=path)
from sklearn.covariance import EllipticEnvelope
import numpy as np

aaa = np.array([[1, 2, -10000, 3, 4, 6, 7, 8, 90, 100, 5000]])
aaa = np.transpose(aaa)  # sklearn mostly takes column vectors as input

outlier = EllipticEnvelope(contamination=.1)
# "assume 10 percent of the data are outliers and find them";
# conventionally set below 10 percent.
# The difference from other detectors: it models the data with a Gaussian
# distribution and its covariance.
outlier.fit(aaa)
print(outlier.predict(aaa))

# contamination=.3
# [ 1  1 -1  1  1  1  1  1  1 -1 -1]
# contamination=.2
# [ 1  1 -1  1  1  1  1  1  1  1 -1]
# contamination=.1
# [ 1  1 -1  1  1  1  1  1  1  1  1]

# ======================
# Does it work on 2-D data, not just 1-D?
# Yes, it does; the criterion is per column (feature).
# aaa = np.array([[1, 2, 3, 4, 10000, 6, 7, 5000, 90, 100],
#                 [1000, 2000, 3, 4000, 5000, 6000, 7000, 8, 9000, 10000]])
# [ 1  1  1  1 -1  1  1  1  1  1]
# "looking at the whole (10, 2)-shaped data, there is an outlier somewhere in row 5"
def AnomalyDetection(filepath):
    train_X = np.loadtxt(filepath + 'normalized_train_file.csv', delimiter=',', dtype=float, skiprows=1)
    test_X = np.loadtxt(filepath + 'pseudonormalized_test_file.csv', delimiter=',', dtype=float, skiprows=1)
    train_Y = np.loadtxt(filepath + 'Y_train_file.csv', delimiter=',', dtype=float, skiprows=1)
    test_Y = np.loadtxt(filepath + 'Y_test_file.csv', delimiter=',', dtype=float, skiprows=1)

    input_dimensions = str(train_X.shape[1])      # feature length
    samples_size = str(train_X.shape[0])          # number of rows
    input_dimensions_test = str(test_X.shape[1])  # feature length
    samples_size_test = str(test_X.shape[0])      # number of rows
    num_failed_train = train_Y[train_Y == 1].shape[0]
    num_failed_test = test_Y[test_Y == 1].shape[0]

    with open(filepath + 'outliers_new_results.txt', 'w') as output:
        output.write("===== DATA INFORMATION =====\n")
        output.write('training data size: ' + samples_size + ' by ' + input_dimensions + '\n')
        output.write('test data size: ' + samples_size_test + ' by ' + input_dimensions_test + '\n')
        output.write('failed points in training: ' + str(num_failed_train))
        output.write('failed points in testing: ' + str(num_failed_test))

        # change input data for this method:
        training = train_X[np.where(train_Y == 0)]
        testing = np.concatenate((test_X, train_X[np.where(train_Y == 1)]))
        testing_Y = np.concatenate((test_Y, train_Y[np.where(train_Y == 1)]))
        input_dimensions = str(training.shape[1])      # feature length
        samples_size = str(training.shape[0])          # number of rows
        input_dimensions_test = str(testing.shape[1])  # feature length
        samples_size_test = str(testing.shape[0])      # number of rows

        #####################################################################
        # ONE CLASS SVM
        #####################################################################
        print()
        print('One Class SVM')  # healthy data to train only
        print()
        output.write("\n===== ONE CLASS SVM =====\n")
        output.write("===== DATA INFORMATION FOR THIS METHOD =====\n")
        output.write('training data size: ' + samples_size + ' by ' + input_dimensions + '\n')
        output.write('test data size: ' + samples_size_test + ' by ' + input_dimensions_test + '\n')
        output.write('training set is all healthy data, testing set contains other data and all failed points\n')

        clf = svm.OneClassSVM(nu=0.15, kernel='rbf', gamma=0.75)  # nu=0.15
        clf.fit(training)
        with open(filepath + 'svm_one_class.pickle', 'wb') as f:
            pickle.dump(clf, f)
        y_pred_train = clf.predict(training)
        y_pred_test = clf.predict(testing)
        anomaly_detection_error(y_pred_train, train_Y[train_Y == 0], "training",
                                output, filepath + 'OneClassSVM', OneClassSVMMethod=True)
        anomaly_detection_error(y_pred_test, testing_Y, "testing",
                                output, filepath + 'OneClassSVM', OneClassSVMMethod=True)

        #####################################################################
        # ISOLATION FOREST
        #####################################################################
        print()
        print('IsolationForest')
        print()
        output.write("\n===== ISOLATION FOREST =====\n")
        # Example settings
        n_samples = 100
        samples_max = 0.7336951612320737
        contamination_fraction = 0.11294048783176784
        clf = IsolationForest(n_estimators=n_samples, max_samples=samples_max,
                              contamination=contamination_fraction, random_state=0)
        clf.fit(train_X)
        with open(filepath + 'IsolationForest.pickle', 'wb') as f:
            pickle.dump(clf, f)
        y_pred_train = clf.predict(train_X)
        y_pred_test = clf.predict(test_X)
        anomaly_detection_error(y_pred_train, train_Y, "training", output,
                                filepath + 'Isolation Forest')
        anomaly_detection_error(y_pred_test, test_Y, "testing", output,
                                filepath + 'Isolation Forest')
        #####################################################################
        # ELLIPTIC ENVELOPE
        #####################################################################
        print()
        print('Elliptic Envelope')
        print()
        output.write("\n===== ELLIPTIC ENVELOPE =====\n")
        clf = EllipticEnvelope(contamination=0.175, random_state=0)
        clf.fit(train_X)
        with open(filepath + 'EllipticEnvelope.pickle', 'wb') as f:
            pickle.dump(clf, f)
        y_pred_train = clf.predict(train_X)
        y_pred_test = clf.predict(test_X)
        anomaly_detection_error(y_pred_train, train_Y, "training", output, filepath + 'EE')
        anomaly_detection_error(y_pred_test, test_Y, "testing", output, filepath + 'EE')

        #####################################################################
        # LOCAL OUTLIER FACTOR
        #####################################################################
        print()
        print('Local Outlier Factor')
        print()
        output.write("\n===== LOCAL OUTLIER FACTOR =====\n")
        for i in [100, 150, 200, 500, 1000]:
            clf = LocalOutlierFactor(n_neighbors=i, contamination=0.25)
            y_pred_train = clf.fit_predict(train_X)
            # _predict is a private scikit-learn method; with a recent release,
            # construct LocalOutlierFactor(novelty=True) and call predict instead
            y_pred_test = clf._predict(test_X)
            anomaly_detection_error(y_pred_train, train_Y, "training", output, filepath + 'LOF')
            anomaly_detection_error(y_pred_test, test_Y, "testing", output, filepath + 'LOF')
            with open('R:\\SMM-Structures\\A1-010391 (Navy IPMS data analytics)\\Technical\\Data\\datafiles\\' + 'LOF {} neighbours.pickle'.format(i), 'wb') as f:
                pickle.dump(clf, f)
            print()
def get_monitoring_tools(X, y):
    """
    determine outlier and distance thresholds
    return thresholds, outlier model(s) and source distributions for distances
    NOTE: for classification the outlier detection on y is not needed
    """
    preprocessor = get_preprocessor()
    preprocessor = preprocessor.fit(X)
    X_pp = preprocessor.transform(X)

    xpipe = Pipeline(steps=[
        ('pca', PCA(2)),
        ('clf', EllipticEnvelope(random_state=0, contamination=0.01))])
    xpipe.fit(X_pp)

    bs_samples = 1000
    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)

    for b in range(bs_samples):
        n_samples = int(np.round(0.80 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples, replace=True).astype(int)
        y_bs = y[subset_indices]
        X_bs = X_pp[subset_indices, :]

        test1 = xpipe.predict(X_bs)
        wasserstein_X[b] = wasserstein_distance(X_pp.flatten(), X_bs.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

    ## determine thresholds as a function of the confidence intervals
    outliers_X.sort()
    outlier_X_threshold = outliers_X[int(0.975 * bs_samples)] + outliers_X[int(0.025 * bs_samples)]

    wasserstein_X.sort()
    wasserstein_X_threshold = wasserstein_X[int(0.975 * bs_samples)] + wasserstein_X[int(0.025 * bs_samples)]

    wasserstein_y.sort()
    wasserstein_y_threshold = wasserstein_y[int(0.975 * bs_samples)] + wasserstein_y[int(0.025 * bs_samples)]

    to_return = {"outlier_X": np.round(outlier_X_threshold, 1),
                 "wasserstein_X": np.round(wasserstein_X_threshold, 2),
                 "wasserstein_y": np.round(wasserstein_y_threshold, 2),
                 "preprocessor": preprocessor,
                 "clf_X": xpipe,
                 "X_source": X_pp,
                 "y_source": y,
                 "latest_X": X,
                 "latest_y": y}
    return to_return
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.covariance import EllipticEnvelope
import matplotlib.pyplot as plt

X1 = np.loadtxt('slocbool.txt')

ee = EllipticEnvelope(support_fraction=1., contamination=0.02)
xx, yy = np.meshgrid(np.linspace(0, 1500000, 542), np.linspace(0, 15000, 542))

ee.fit(X1)
Z = ee.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(1)
plt.title("Outlier detection: SLOC vs BOOL")
plt.scatter(X1[:, 0], X1[:, 1], color='black')
plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='m')
plt.ylabel("count of boolean expressions")
plt.xlabel("count of source lines of code")
plt.show()
def detect(file_path, space, deleted_features):
    """ Detect outliers """
    start_time = time.time()
    print("==================================================")
    print("Outlier detection and treatment started ...")
    print("Space:", space)

    X = pd.read_csv(file_path)
    if len(deleted_features) > 0:
        X = X.drop(deleted_features, axis=1, inplace=False)

    # Basic data cleaning
    X = data_cleaning_formatting(X)

    y_predicted = None
    params = space['params']
    error = dict()
    try:
        if space['model'] == "DBSCAN":
            model = DBSCAN(**params)
            y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x < 0 else 0, y_predicted))
        elif space['model'] == "OPTICS":
            model = OPTICS(**params)
            y_predicted = model.fit_predict(X)
            print(y_predicted)
            y_predicted = list(map(lambda x: 1 if x < 0 else 0, y_predicted))
        elif space['model'] == "EllipticEnvelope":
            model = EllipticEnvelope(**params)
            y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x == -1 else 0, y_predicted))
        elif space['model'] == "IsolationForest":
            model = IsolationForest(**params)
            with parallel_backend('threading'):
                y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x == -1 else 0, y_predicted))
        elif space['model'] == "OneClassSVM":
            model = OneClassSVM(**params)
            y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x == -1 else 0, y_predicted))
        elif space['model'] == "LocalOutlierFactor":
            model = LocalOutlierFactor(**params)
            with parallel_backend('threading'):
                y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x == -1 else 0, y_predicted))
        elif space['model'] == "zscore":
            model = ZScore(threshold=params['threshold'])
            y_predicted = model.fit_predict(X)
    except Exception as e:
        print("Error:", e)
        y_predicted = [0] * X.shape[0]
        error['detect_' + str(space)] = e

    if isinstance(y_predicted, list):
        y_predicted = np.array(y_predicted)

    time_taken = time.time() - start_time
    print("Time taken:", time_taken)
    return y_predicted
# Elliptic envelope outlier detection for a real dataset
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.covariance import EllipticEnvelope
from scipy import stats

# Get the data (NOTE: load_boston was removed in scikit-learn 1.2)
dataset = load_boston()
data = dataset["data"][:, [8, 10]]  # two-cluster data
contamination = 0.261

# Fit the model
clf = EllipticEnvelope(contamination=contamination)
clf.fit(data)

# Perform outlier detection
predicted_data = clf.predict(data)
inlier_predicted_data = data[predicted_data == 1]
outlier_predicted_data = data[predicted_data == -1]
num_inliers_predicted = inlier_predicted_data.shape[0]
num_outliers_predicted = outlier_predicted_data.shape[0]

# Plot decision function values
xr = np.linspace(-5, 30, 500)
yr = np.linspace(5, 30, 500)
xx, yy = np.meshgrid(xr, yr)
zz = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
zz = zz.reshape(xx.shape)
scores = clf.decision_function(data)
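# Since load_boston no longer ships with current scikit-learn, here is a rough
# stand-in for the demo above on synthetic two-cluster data (the shapes and
# constants are assumptions, not the Boston housing data):
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.covariance import EllipticEnvelope

data, _ = make_blobs(n_samples=506, centers=2, cluster_std=2.0, random_state=0)
clf = EllipticEnvelope(contamination=0.261).fit(data)
pred = clf.predict(data)
print(pred[pred == -1].size, "outliers of", data.shape[0])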
def online_anomaly_detection(self, data_seasnCorec, baseline_window,
                             sliding_window, outliers_fraction):
    if mode == 'daily':
        for i in range(baseline_window, len(data_seasnCorec)):
            data_train_w = data_seasnCorec[i - baseline_window:i]
            # train data normalization --------------------------------------
            data_train_w += 0.1
            standardizer = StandardScaler().fit(data_train_w.values)
            data_train_scaled = standardizer.transform(data_train_w.values)
            data_train_scaled_features = pd.DataFrame(
                data_train_scaled,
                index=data_train_w.index,
                columns=data_train_w.columns)

            data = pd.DataFrame(data_train_scaled_features)
            data_1 = pd.DataFrame(data).fillna(0)
            data_1['steps'] = '0'
            data_1['steps_window_12'] = data_1['steps']
            data_train_w = data_1
            data_train.append(data_train_w)

            data_test_w = data_seasnCorec[i:i + sliding_window]
            # test data normalization ---------------------------------------
            data_test_w += 0.1
            data_test_scaled = standardizer.transform(data_test_w.values)
            data_scaled_features = pd.DataFrame(
                data_test_scaled,
                index=data_test_w.index,
                columns=data_test_w.columns)

            data = pd.DataFrame(data_scaled_features)
            data_1 = pd.DataFrame(data).fillna(0)
            data_1['steps'] = '0'
            data_1['steps_window_12'] = data_1['steps']
            data_test_w = data_1
            data_test.append(data_test_w)

            # fit the model --------------------------------------------------
            model = EllipticEnvelope(
                random_state=RANDOM_SEED,
                contamination=outliers_fraction,
                support_fraction=0.7).fit(data_train_w)
            # predict the test set
            preds = model.predict(data_test_w)
            # preds = preds.rename(lambda x: 'anomaly' if x == 0 else x, axis=1)
            dfs.append(preds)
    else:
        for i in range(baseline_window, len(data_seasnCorec)):
            if (i - baseline_window) // 24 % 7 == 0:
                recent_index = i
                data_train_w = data_seasnCorec[i - baseline_window:i]
                data_train_w += 0.1
                standardizer = StandardScaler().fit(data_train_w.values)
                data_train_scaled = standardizer.transform(data_train_w.values)
                data_train_scaled_features = pd.DataFrame(
                    data_train_scaled,
                    index=data_train_w.index,
                    columns=data_train_w.columns)

                data = pd.DataFrame(data_train_scaled_features)
                data_1 = pd.DataFrame(data).fillna(0)
                data_1['steps'] = '0'
                data_1['steps_window_12'] = data_1['steps']
                data_train_w = data_1
                data_train.append(data_train_w)
            else:
                data_train_w = data_seasnCorec[recent_index - baseline_window:recent_index]
                data_train_w += 0.1
                standardizer = StandardScaler().fit(data_train_w.values)
                data_train_scaled = standardizer.transform(data_train_w.values)
                data_train_scaled_features = pd.DataFrame(
                    data_train_scaled,
                    index=data_train_w.index,
                    columns=data_train_w.columns)

                data = pd.DataFrame(data_train_scaled_features)
                data_1 = pd.DataFrame(data).fillna(0)
                data_1['steps'] = '0'
                data_1['steps_window_12'] = data_1['steps']
                data_train_w = data_1
                data_train.append(data_train_w)

            data_test_w = data_seasnCorec[i:i + sliding_window]
            # test data normalization ---------------------------------------
            data_test_w += 0.1
            data_test_scaled = standardizer.transform(data_test_w.values)
            data_scaled_features = pd.DataFrame(
                data_test_scaled,
                index=data_test_w.index,
                columns=data_test_w.columns)

            data = pd.DataFrame(data_scaled_features)
            data_1 = pd.DataFrame(data).fillna(0)
            data_1['steps'] = '0'
            data_1['steps_window_12'] = data_1['steps']
            data_test_w = data_1
            data_test.append(data_test_w)

            model = EllipticEnvelope(
                random_state=RANDOM_SEED,
                contamination=outliers_fraction,
                support_fraction=0.7).fit(data_train_w)
            # predict the test set
            preds = model.predict(data_test_w)
            # preds = preds.rename(lambda x: 'anomaly' if x == 0 else x, axis=1)
            dfs.append(preds)
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn.datasets import load_boston  # NOTE: removed in scikit-learn 1.2

# Get data
X1 = load_boston()['data'][:, [8, 10]]  # two clusters
X2 = load_boston()['data'][:, [5, 12]]  # "banana"-shaped

# Define "classifiers" to be used
classifiers = {
    "Empirical Covariance":
        EllipticEnvelope(support_fraction=1., contamination=0.261),
    "Robust Covariance (Minimum Covariance Determinant)":
        EllipticEnvelope(contamination=0.261),
    "OCSVM": OneClassSVM(nu=0.261, gamma=0.05)}
colors = ['m', 'g', 'b']
legend1 = {}
legend2 = {}

# Learn a frontier for outlier detection with several classifiers
xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500))
xx2, yy2 = np.meshgrid(np.linspace(3, 10, 500), np.linspace(-5, 45, 500))
for i, (clf_name, clf) in enumerate(classifiers.items()):
    plt.figure(1)
    clf.fit(X1)
def get_update_annots(ls_in):
    (f_idx, file, a_usr, b_sys, annots_usr, annots_sys, data_usr, data_sys,
     num_feat_per_person, pad_noise_bool, n_pre) = ls_in
    data_pts = []
    file_length = 0
    # framelen = 0.05
    framelen = 5
    pred_len_max = 20
    # turn_batch = 4

    # get covariance matrix
    backup_cov = np.array(json.load(open('./tools/mean_cov.json', 'rb')))

    # silence bools are for when both speakers are silent
    silence_bools = np.where(annots_usr['prev_gap_silence_bools'])[0][1:]
    ipu_strts, ipu_ends = convert_to_ms_int_floor(
        annots_usr['ipu_start_times']), convert_to_ms_int_floor(
            annots_usr['ipu_end_times'])
    sil_strts, sil_ends = ipu_ends[silence_bools - 1], ipu_strts[silence_bools]
    try:
        na = np.where(sil_ends > sil_strts)[0]
        sil_strts, sil_ends = sil_strts[na], sil_ends[na]
        na = np.where(sil_strts[1:] >= sil_ends[:-1])[0]
        sil_strts[1:], sil_ends[1:] = sil_strts[na + 1], sil_ends[na + 1]
    except Exception:
        print('error at file: ' + file)
    # assert all(sil_ends > sil_strts)
    # assert all(sil_strts[1:] > sil_ends[:-1])

    # use this to estimate the covariance matrix of the OPPOSITE person.
    # we do this because there is less of a chance that the other person will
    # be making noises such as lip smacks etc.. (prob better reasons)
    ls = [np.arange(s, e)
          for s, e in zip(np.rint(sil_strts / framelen),
                          np.rint(sil_ends / framelen)) if e > s]
    if len(ls):
        l = np.concatenate(ls)
        silences = data_sys[l.astype(int)]
    else:
        print('bad file')
        print(file)
        # pad with zeros instead:
        silences = np.zeros([3, data_sys.shape[1]])
    # old covariance estimation
    # self.sil_cov_matrices[file][b_sys] = np.cov(silences, rowvar=False)
    # self.sil_means[file][b_sys] = np.mean(silences, 0)

    # if padding test_seqs with noise, estimate the elliptic covariance matrix
    # to avoid outliers
    sil_means = np.mean(silences, 0)
    if pad_noise_bool:
        try:
            cov = EllipticEnvelope().fit(silences - sil_means)
            cov = cov.covariance_
        except Exception:
            cov = backup_cov
    else:
        cov = []

    # get va annotations
    max_time = int(
        np.rint(
            max([convert_to_ms_int_floor(annots_usr['end_time_words'][-1]),
                 convert_to_ms_int_floor(annots_sys['end_time_words'][-1])
                 ]) / framelen))

    def get_va_annots(annots, max_time):
        va_annots = np.zeros(max_time, dtype=np.int16)
        for wrd_strt, wrd_end in zip(
                convert_to_ms_int_floor(annots['start_time_words']),
                convert_to_ms_int_floor(annots['end_time_words'])):
            wrd_strt_f = int(np.rint(wrd_strt / framelen))
            wrd_end_f = int(np.rint(wrd_end / framelen))
            # (maybe) need to add plus 1 because of the floor operator
            va_annots[wrd_strt_f:wrd_end_f] = 1
        return va_annots

    va_annots_usr = get_va_annots(annots_usr, max_time)
    va_annots_sys = get_va_annots(annots_sys, max_time)
    # pad with extra values for predictions
    va_annots_usr = np.concatenate(
        [va_annots_usr, np.zeros(pred_len_max + 1, dtype=np.int16)])
    va_annots_sys = np.concatenate(
        [va_annots_sys, np.zeros(pred_len_max + 1, dtype=np.int16)])
    # hs_annots_sys = get_hs_annots(annots_sys, va_annots_usr, max_time)

    sys_update_start_frames = np.rint(
        convert_to_ms_int_floor(
            annots_usr['updates']['sys_update_strt_times'])[:-1] /
        framelen).astype(np.int32)
    sys_update_end_frames = np.rint(
        convert_to_ms_int_floor(
            annots_usr['updates']['sys_update_end_times'])[:-1] /
        framelen).astype(np.int32)
    # num_updates = len(annots_usr['updates']['sys_update_turns']) - 1  # omit last update

    usr_updates, sys_updates, sys_turns = [], [], []
    update_batch_list = []
    for update_idx, (strt_fidx_update, end_fidx_update) in enumerate(
            zip(sys_update_start_frames, sys_update_end_frames)):
        strt_t_update = convert_to_ms_int_floor(
            annots_usr['updates']['sys_update_strt_times'][update_idx])
        end_t_update = convert_to_ms_int_floor(
            annots_usr['updates']['sys_update_end_times'][update_idx])
        if strt_fidx_update == end_fidx_update:
            print('Update is zero length')
            pdb.set_trace()
        strt_fidx_update = strt_fidx_update - 1

        # Get associated turns for the user
        usr_turn_words_start_time_ms_int = convert_to_ms_int_floor(
            annots_usr['turn_words_start_time'])
        usr_turn_words_end_time_ms_int = convert_to_ms_int_floor(
            annots_usr['turn_words_end_time'])
        usr_update_turns = \
            (usr_turn_words_start_time_ms_int < strt_t_update) & (usr_turn_words_end_time_ms_int >= strt_t_update) | \
            (usr_turn_words_start_time_ms_int >= strt_t_update) & (usr_turn_words_start_time_ms_int < end_t_update) | \
            (usr_turn_words_start_time_ms_int < end_t_update) & (usr_turn_words_end_time_ms_int >= end_t_update)
        usr_update_turns = np.where(usr_update_turns)[0]
        usr_update_turn_starts_t = annots_usr['turn_words_start_time'][usr_update_turns]
        usr_update_turn_ends_t = annots_usr['turn_words_end_time'][usr_update_turns]

        sys_update_turn_start_t = annots_sys['turn_words_start_time'][update_idx]
        sys_update_turn_end_t = annots_sys['turn_words_end_time'][update_idx]
        sys_turn_enc_strt_f = int(
            np.rint(convert_to_ms_int_floor(sys_update_turn_start_t) / framelen))
        sys_turn_enc_end_f = int(
            np.rint(convert_to_ms_int_floor(sys_update_turn_end_t) / framelen))
        assert convert_to_ms_int_floor(sys_update_turn_end_t) == end_t_update
        sys_turn_full_over = annots_sys['turn_full_overlap'][update_idx]

        # If the system turn is not in full overlap, get the user turn that it
        # is associated with and the offset
        if not (sys_turn_full_over or update_idx == 0):
            # Associated user turn is the user turn that began directly before
            # the system turn
            associated_usr_turn = usr_update_turns[np.where(
                convert_to_ms_int_floor(usr_update_turn_starts_t) <
                convert_to_ms_int_floor(sys_update_turn_start_t))[0][-1]]
            # hack. Also, catch other overlaps that aren't in sys_turn_full_over
            if annots_usr['turn_words_end_time'][associated_usr_turn] > sys_update_turn_end_t:
                associated_usr_turn = -1
        else:
            associated_usr_turn = -1

        # Get associated IPUs for the user
        usr_ipu_start_time_ms_int = convert_to_ms_int_floor(
            annots_usr['ipu_start_times'])
        usr_ipu_end_time_ms_int = convert_to_ms_int_floor(
            annots_usr['ipu_end_times'])
        usr_update_ipus = \
            (usr_ipu_start_time_ms_int < strt_t_update) & (usr_ipu_end_time_ms_int >= strt_t_update) | \
            (usr_ipu_start_time_ms_int >= strt_t_update) & (usr_ipu_start_time_ms_int < end_t_update) | \
            (usr_ipu_start_time_ms_int < end_t_update) & (usr_ipu_end_time_ms_int >= end_t_update)
        usr_update_ipus = np.where(usr_update_ipus)[0]
        usr_update_ipus_starts_t = annots_usr['ipu_start_times'][usr_update_ipus]
        usr_update_ipus_ends_t = annots_usr['ipu_end_times'][usr_update_ipus]

        if update_idx == 0:
            associated_usr_ipu = -1
            associated_usr_ipu_strt_t = 0
            associated_usr_ipu_end_t = -1
            associated_usr_ipu_strt_f = 0
            associated_usr_ipu_end_f = -1
        # If the system turn is not in full overlap, get the user turn that it
        # is associated with and the offset
        elif not (associated_usr_turn == -1):
            # Associated user IPU is the user IPU that began directly before
            # the system turn
            associated_usr_ipu = usr_update_ipus[np.where(
                convert_to_ms_int_floor(usr_update_ipus_starts_t) <
                convert_to_ms_int_floor(sys_update_turn_start_t))[0][-1]]
            associated_usr_ipu_strt_t = annots_usr['ipu_start_times'][associated_usr_ipu]
            associated_usr_ipu_end_t = annots_usr['ipu_end_times'][associated_usr_ipu]
            associated_usr_ipu_strt_f = int(
                np.rint(convert_to_ms_int_floor(associated_usr_ipu_strt_t) / framelen))
            associated_usr_ipu_end_f = int(
                np.rint(convert_to_ms_int_floor(associated_usr_ipu_end_t) / framelen))
        else:
            associated_usr_ipu = np.where(
                convert_to_ms_int_floor(annots_usr['ipu_start_times']) <
                convert_to_ms_int_floor(sys_update_turn_start_t))[0][-1]
            associated_usr_ipu_strt_t = annots_usr['ipu_start_times'][associated_usr_ipu]
            associated_usr_ipu_end_t = annots_usr['ipu_end_times'][associated_usr_ipu]
            associated_usr_ipu_strt_f = int(
                np.rint(convert_to_ms_int_floor(associated_usr_ipu_strt_t) / framelen))
            associated_usr_ipu_end_f = int(
                np.rint(convert_to_ms_int_floor(associated_usr_ipu_end_t) / framelen))
            associated_usr_ipu = -1

        # get updates
        usr_update = data_usr[strt_fidx_update:end_fidx_update]
        sys_update = data_sys[strt_fidx_update:end_fidx_update]
        # continuous voice activity annotations
        cont_pred_vec_usr = va_annots_usr[strt_fidx_update:end_fidx_update + 20]
        cont_pred_vec_sys = va_annots_sys[strt_fidx_update:end_fidx_update + 20]
        # Get system turn for the encoder
        sys_enc_feats = data_sys[sys_turn_enc_strt_f:sys_turn_enc_end_f]

        # Get test_seq
        # Find the first switch from silence to speech by the user after the
        # system ground truth start and pad with silence noise.
        sil_indx = 0
        while sil_indx < len(va_annots_usr[sys_turn_enc_strt_f:]) - 1:
            if (va_annots_usr[sys_turn_enc_strt_f:][sil_indx] == 0
                    and va_annots_usr[sys_turn_enc_strt_f:][sil_indx + 1] == 1):
                break
            else:
                sil_indx += 1
        # sil_indx is one frame before the last of the silence frames
        sil_indx = sys_turn_enc_strt_f + sil_indx
        if (sil_indx - strt_fidx_update) == 0:
            sil_indx += 1
        test_seq = data_usr[strt_fidx_update:sil_indx]
        try:
            assert test_seq.shape[0] > 0
        except AssertionError:
            print('test seq shape is zero in file: ' + file)

        # Get train Y
        y_UT = np.zeros(len(usr_update), dtype=np.int16)
        # protect against turns that start on the first frame of the file
        y_train_strt = max([0, sys_turn_enc_strt_f - 1 - strt_fidx_update])
        y_UT[y_train_strt:sys_turn_enc_end_f - strt_fidx_update - 1] = 1
        if not any(y_UT == 1):
            print('bad')
        y_strt_t = sys_update_turn_start_t - \
            annots_usr['updates']['sys_update_strt_times'][update_idx]
        y_end_t = sys_update_turn_end_t - \
            annots_usr['updates']['sys_update_strt_times'][update_idx]
        y_strt_f = sys_turn_enc_strt_f - strt_fidx_update
        y_end_f = sys_turn_enc_end_f - strt_fidx_update
        associated_usr_ipu_strt_f = associated_usr_ipu_strt_f - strt_fidx_update
        associated_usr_ipu_end_f = associated_usr_ipu_end_f - strt_fidx_update

        # Get words
        # Candidate system turn encoding words and update words
        s_i = annots_sys['turn_words_start_indx'][update_idx]
        e_i = annots_sys['turn_words_end_indx'][update_idx] + 1
        sys_enc_words = annots_sys['target_words'][s_i:e_i]
        sys_enc_word_strt_ts = annots_sys['start_time_words'][s_i:e_i]
        sys_enc_word_end_ts = annots_sys['end_time_words'][s_i:e_i]
        sys_update_word_strt_frames = np.rint(
            convert_to_ms_int_floor(sys_enc_word_strt_ts) / framelen) - strt_fidx_update
        sys_update_word_end_frames = np.rint(
            convert_to_ms_int_floor(sys_enc_word_end_ts) / framelen) - strt_fidx_update
        sys_enc_word_strt_frames = np.rint(
            convert_to_ms_int_floor(sys_enc_word_strt_ts) / framelen) - sys_turn_enc_strt_f
        sys_enc_word_end_frames = np.rint(
            convert_to_ms_int_floor(sys_enc_word_end_ts) / framelen) - sys_turn_enc_strt_f

        # User update words
        if not len(usr_update_turns):
            s_i, e_i = 0, 0
        else:
            s_i = annots_usr['turn_words_start_indx'][usr_update_turns][0]
            e_i = annots_usr['turn_words_end_indx'][usr_update_turns][-1] + 1
        usr_update_words = annots_usr['target_words'][s_i:e_i]
        usr_update_word_strt_ts = annots_usr['start_time_words'][s_i:e_i]
        usr_update_word_end_ts = annots_usr['end_time_words'][s_i:e_i]
        usr_update_word_strt_frames = np.rint(
            convert_to_ms_int_floor(usr_update_word_strt_ts) / framelen) - strt_fidx_update
        usr_update_word_end_frames = np.rint(
            convert_to_ms_int_floor(usr_update_word_end_ts) / framelen) - strt_fidx_update

        # test seq words
        usr_end_fs = np.rint(
            convert_to_ms_int_floor(annots_usr['end_time_words']) / framelen)
        test_wrd_indices = np.where((usr_end_fs >= strt_fidx_update) &
                                    (usr_end_fs < sil_indx))[0]
        if not len(test_wrd_indices):
            s_i, e_i = 0, 0
        else:
            s_i, e_i = test_wrd_indices[0], test_wrd_indices[-1] + 1
        test_words = annots_usr['target_words'][s_i:e_i]
        test_word_strt_ts = annots_usr['start_time_words'][s_i:e_i]
        test_word_end_ts = annots_usr['end_time_words'][s_i:e_i]
        test_word_strt_frames = np.rint(
            convert_to_ms_int_floor(test_word_strt_ts) / framelen) - strt_fidx_update
        test_word_end_frames = np.rint(
            convert_to_ms_int_floor(test_word_end_ts) / framelen) - strt_fidx_update

        # dialogue acts for sys encoding
        turn_ipu_start_indx = annots_sys['turn_ipu_start_indx'][update_idx]
        turn_ipu_end_indx = annots_sys['turn_ipu_start_indx'][update_idx + 1]
= annots_sys['da_ISO_second_pass_vec'][turn_ipu_start_indx:turn_ipu_end_indx] sys_enc_da_strt_ts = annots_sys['ipu_start_times'][ turn_ipu_start_indx:turn_ipu_end_indx] sys_enc_da_end_ts = annots_sys['ipu_end_times'][ turn_ipu_start_indx:turn_ipu_end_indx] sys_enc_da_strt_frames = np.rint( convert_to_ms_int_floor(sys_enc_da_strt_ts) / framelen) - sys_turn_enc_strt_f sys_enc_da_end_frames = np.rint( convert_to_ms_int_floor(sys_enc_da_end_ts) / framelen) - sys_turn_enc_strt_f word_da_dict = { 'strt_t_update': strt_t_update, 'end_t_update': end_t_update, 'strt_fidx_update': strt_fidx_update, 'end_fidx_update': end_fidx_update, 'sys_enc_words': sys_enc_words, 'sys_enc_word_strt_ts': sys_enc_word_strt_ts, 'sys_enc_word_end_ts': sys_enc_word_end_ts, 'sys_update_words': sys_enc_words, 'sys_update_word_strt_frames': sys_update_word_strt_frames.astype(np.int16), 'sys_update_word_end_frames': sys_update_word_end_frames.astype(np.int16), 'sys_enc_word_strt_frames': sys_enc_word_strt_frames.astype(np.int16), 'sys_enc_word_end_frames': sys_enc_word_end_frames.astype(np.int16), 'usr_update_words': usr_update_words, 'usr_update_word_strt_ts': usr_update_word_strt_ts, 'usr_update_word_end_ts': usr_update_word_end_ts, 'usr_update_word_strt_frames': usr_update_word_strt_frames.astype(np.int16), 'usr_update_word_end_frames': usr_update_word_end_frames.astype(np.int16), 'test_words': test_words, 'test_word_strt_ts': test_word_strt_ts, 'test_word_end_ts': test_word_end_ts, 'test_word_strt_frames': test_word_strt_frames.astype(np.int16), 'test_word_end_frames': test_word_end_frames.astype(np.int16), # 'sys_enc_das': sys_enc_das, 'sys_enc_da_strt_ts': sys_enc_da_strt_ts, 'sys_enc_da_end_ts': sys_enc_da_end_ts, 'sys_enc_da_strt_frames': sys_enc_da_strt_frames, 'sys_enc_da_end_frames': sys_enc_da_end_frames } data_pts.append({ 'y_strt_f': [y_strt_f], 'y_strt_t': [y_strt_t], 'y_end_f': [y_end_f], 'y_end_t': [y_end_t], 'y_length': [len(sys_enc_feats)], 'associated_usr_ipu_strt_f': [associated_usr_ipu_strt_f], 'associated_usr_ipu_end_f': [associated_usr_ipu_end_f], 'usr_update': usr_update, 'sys_update': sys_update, 'sys_trn': [sys_enc_feats], 'test_seq': test_seq, 'file': [file], 'a_usr': [a_usr], 'update_strt_t': [annots_usr['updates']['sys_update_strt_times'][update_idx]], 'update_end_t': [annots_usr['updates']['sys_update_end_times'][update_idx]], 'update_strt_f': [strt_fidx_update], 'update_end_f': [end_fidx_update], 'associated_usr_turn': associated_usr_turn, 'update_idx': update_idx, 'y_UT': y_UT, 'va_usr': cont_pred_vec_usr, 'va_sys': cont_pred_vec_sys, 'word_da_dict': word_da_dict }) file_length += 1 return [data_pts, sil_means, cov, file, a_usr, b_sys, file_length]
answerIF_proba = pd.DataFrame({'target': answerIF_proba})
pickle.dump(ilf, open("../../data/model/IsolationForest", "wb"))

## Local Outlier Factor
lof = LocalOutlierFactor(n_neighbors=2, novelty=True)
lof.fit(data_scaled_means)
answerLOF_proba = lof.decision_function(data_scaled_means)
answerLOF_proba = 1 - ((answerLOF_proba - answerLOF_proba.min()) /
                       (answerLOF_proba.max() - answerLOF_proba.min()))
answerLOF_proba = pd.DataFrame({'target': answerLOF_proba})
pickle.dump(lof, open("../../data/model/LocalOutlierFactor", "wb"))

## Elliptic Envelope
ee = EllipticEnvelope()
ee.fit(data_scaled_means)
answerEE_proba = ee.decision_function(data_scaled_means)
answerEE_proba = 1 - (answerEE_proba - 3 * answerEE_proba.min()) * 10**12
answerEE_proba = pd.DataFrame({'target': answerEE_proba})
pickle.dump(ee, open("../../data/model/EllipticEnvelope", "wb"))

##############
### Soft voting
voting_answer = pd.DataFrame({
    'target': ((answerIF_proba * 2 + answerLOF_proba * 1 + answerEE_proba * 2) / 5)
    .T.apply(lambda x: -1 if x.values[0] > 0.4 else 1)
})
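# The three detectors above score on different scales, so each raw
# decision_function output is rescaled before the weighted vote. A hedged
# helper capturing the min-max inversion used for the LOF branch (the helper
# name is ours, not from the source; the EE branch uses its own ad-hoc scaling):
def to_anomaly_score(raw):
    # map raw scores to [0, 1], with 1 meaning "most anomalous"
    return 1 - (raw - raw.min()) / (raw.max() - raw.min())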
## 5: Misaligned Gaussian Mixture
blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=3)
X_misaligned = make_blobs(centers=[[-0.7, -0.7, -0.7],
                                   [0.7, 0.7, -0.7],
                                   [-0.7, 0.7, 0.7]],
                          cluster_std=[0.2, 0.2, 0.2],
                          **blobs_params)[0]

## 6: Whole dataset
datasets3D = [X_lin, X_hex, X_sph, X_gau, X_misaligned]

# Define the data labels: label 1 as inliers, -1 as outliers
labels = np.concatenate([np.ones(n_inliers), -np.ones(n_outliers)], axis=0)

# define outlier/anomaly detection methods to be compared
anomaly_algorithms = [
    ("Robust covariance",
     EllipticEnvelope(contamination=outliers_fraction)),
    ("One-Class SVM",
     svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)),
    ("Isolation Forest (IF)",
     IsolationForest(n_estimators=500,
                     behaviour='new',  # legacy parameter, removed in scikit-learn 0.24
                     contamination=outliers_fraction,
                     random_state=42)),
    ("Local Outlier Factor",
     LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction))]

plt.figure(figsize=(14, 15))
plt.subplots_adjust(left=.02,
import math


def floatrange(start, stop, step):
    steps = math.floor((stop - start) / step)
    temp = []
    for i in range(steps):
        temp.append(start + step * i)
    return temp


''' Outlier detection '''
plt.figure('Outlier Test', figsize=(9.6, 3.8), dpi=200)
# plt.suptitle('Elliptic Envelope contamination', fontsize=20)
for num, i in enumerate([0.01, 0.03, 0.05, 0.1, 0.2, 0.5]):
    cov = EllipticEnvelope(random_state=1, contamination=i)
    cov.fit(np.hstack([X_array, Y_array.reshape(-1, 1)]))
    index = cov.predict(np.hstack([X_array, Y_array.reshape(-1, 1)]))
    X_valid, X_invalid = error_wipe(X_array, index)
    Y_valid, Y_invalid = error_wipe(Y_array, index)
    reg1 = LinearRegression()
    reg1.fit(X_valid, Y_valid)
    reg2 = LinearRegression()
    reg2.fit(X_array, Y_array)
    reg1.rmse = RMSE(Y_valid, reg1.predict(X_valid))
    reg2.rmse = RMSE(Y_array, reg2.predict(X_array))
    print('reg1', reg1.score(X_valid, Y_valid))
    print('reg2', reg2.score(X_array, Y_array))
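# RMSE and error_wipe are called above but not defined in this fragment.
# Minimal stand-ins consistent with how they are used (the names match the
# fragment; the bodies are our assumption, not the source's):
def RMSE(y_true, y_pred):
    # root-mean-square error between two equal-length arrays
    return np.sqrt(np.mean((np.asarray(y_true) - np.asarray(y_pred)) ** 2))


def error_wipe(arr, index):
    # split rows by EllipticEnvelope.predict labels: 1 = inlier, -1 = outlier
    keep = index == 1
    return arr[keep], arr[~keep]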
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

# Create simulated data
features, _ = make_blobs(n_samples=10,
                         n_features=2,
                         centers=1,
                         random_state=1)

# Replace the first observation's values with extreme values
features[0, 0] = 10000
features[0, 1] = 10000

# Create the detector
outlier_detector = EllipticEnvelope(contamination=.1)

# Fit the detector
outlier_detector.fit(features)

# Predict outliers
outlier_detector.predict(features)

# In[6]:

# Create one feature
feature = features[:, 0]

# Create a function that returns the index of outliers
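# The fragment stops before defining the promised helper. A plausible
# completion (our sketch, not the source's) uses the common 1.5 * IQR rule:
def indices_of_outliers(x):
    # flag points outside [Q1 - 1.5*IQR, Q3 + 1.5*IQR]
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - (iqr * 1.5)
    upper_bound = q3 + (iqr * 1.5)
    return np.where((x > upper_bound) | (x < lower_bound))

# indices_of_outliers(feature) would flag the extreme first observation.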
# SVM
SVM = svm.OneClassSVM(gamma='auto')
detected_results_SVM = SVM.fit_predict(featureData)
outliers_SVM = []
for i in range(len(detected_results_SVM)):
    if detected_results_SVM[i] < 0:
        outliers_SVM.append(i)
SVM_data = np.delete(featureData, outliers_SVM, axis=0)
output_SVM = np.delete(outputLabels, outliers_SVM, axis=0)

# Elliptic envelope
EE = EllipticEnvelope()
detected_results_EE = EE.fit_predict(featureData)
outliers_EE = []
for i in range(len(detected_results_EE)):
    if detected_results_EE[i] < 0:
        outliers_EE.append(i)
EE_data = np.delete(featureData, outliers_EE, axis=0)
output_EE = np.delete(outputLabels, outliers_EE, axis=0)

# Compare the outlier results
accuracy_scores = {}
model0 = svm.SVC()
    return dic[x]


train['type_num'] = train['type'].apply(lambda x: to_number(x, column_number))

# Prepare the datasets to feed the model
train_x = train.drop(columns=['type', 'type_num'], axis=1)
train_y = train['type_num']
test_x = test

# Correlation: -0.84 for (petroMag_u, psfMag_u) and (petroMag_u, fiberMag_u)

# Create the outlier detector
from sklearn.covariance import EllipticEnvelope

outlier_detector = EllipticEnvelope(contamination=.1)

# Train the detector
outlier_detector.fit(train_x)

# Predict outliers
pred = outlier_detector.predict(train_x)
print(pred)
print(pred.shape)

import numpy as np


def find_idx(x):
    return np.where(x < 0)
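# A hedged follow-up (not in the source): use find_idx to drop the flagged
# rows before fitting a model on the cleaned training set.
outlier_idx = find_idx(pred)[0]
train_x_clean = train_x.drop(train_x.index[outlier_idx])
train_y_clean = train_y.drop(train_y.index[outlier_idx])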
from sklearn.svm import OneClassSVM as detector
import numpy as np
from utility import Utility
from sklearn.covariance import EllipticEnvelope

# det = detector()
det = EllipticEnvelope()

# X = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 1000]).reshape(11, 1)
# det.fit(X)
# Y = det.predict(100)
# print(Y)

# Find outliers in the interaction rate data
# Step 1 - Convert the dataset into a pandas series
util = Utility.SeriesUtility()
datasetFileName = "fans_change_taylor_swift.csv"
series = util.convertDatasetsToSeries(datasetFileName)
series = util.resampleSeriesSum(series, "D")
numberOfPoints = series.data.shape[0]
X = series.values.flatten().reshape(numberOfPoints, 1)
det.fit(X)
predicted = det.predict(X)
def ellepticEnvelopeAnomaly(df, outliersFraction):
    # Split into four datasets based on the categories defined before
    classes = [df.loc[df['categories'] == i, 'value'] for i in range(4)]

    # Apply EllipticEnvelope (Gaussian distribution) to each category;
    # the original repeated this block verbatim per class, a loop is equivalent
    fitted = []
    for df_class in classes:
        envelope = EllipticEnvelope(contamination=outliersFraction)
        X_train = df_class.values.reshape(-1, 1)
        envelope.fit(X_train)
        df_class = pd.DataFrame(df_class)
        df_class['deviation'] = envelope.decision_function(X_train)
        df_class['anomaly'] = envelope.predict(X_train)
        fitted.append(df_class)

    # Add the data back to the main frame
    df_class = pd.concat(fitted)
    df['anomaly22'] = df_class['anomaly']
    df['anomaly22'] = np.array(df['anomaly22'] == -1).astype(int)

    # Visualisation of anomalies throughout time (viz 1)
    fig, ax = plt.subplots()
    a = df.loc[df['anomaly22'] == 1, ['time_epoch', 'value']]  # anomalies
    ax.plot(df['time_epoch'], df['value'], color='blue')
    ax.scatter(a['time_epoch'], a['value'], color='red')
    ax.set_title('Elliptic Envelope Multi Clustering')
    plt.show()
    return df
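# A minimal usage sketch for the function above, assuming numpy/pandas/
# matplotlib are imported as elsewhere in this collection; the toy data, the
# four equal-sized categories, and the 5% contamination are our assumptions:
rng = np.random.default_rng(0)
toy = pd.DataFrame({
    'time_epoch': np.arange(400),
    'value': rng.normal(size=400),
    'categories': np.repeat([0, 1, 2, 3], 100),
})
toy = ellepticEnvelopeAnomaly(toy, outliersFraction=0.05)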
def plot_raw_overview(filename):
    event_type = 'all'
    if filename.name.startswith('sub-drouwen'):
        CHANS = [f'IH0{x + 1}' for x in range(8)]
    elif filename.name.startswith('sub-itens'):
        CHANS = [f'C0{x + 1}' for x in range(8)]
    elif filename.name.startswith('sub-lemmer'):
        CHANS = [f'IH{x + 1}' for x in range(8)]
    elif filename.name.startswith('sub-som705'):
        CHANS = [f'GA0{x + 1}' for x in range(8)]  # a bit random
    elif filename.name.startswith('sub-ommen'):
        CHANS = ['chan1', 'chan2']  # I don't understand why I cannot use 'chan64'
    elif filename.name.startswith('sub-vledder') or filename.name.startswith('sub-ommen'):
        # note: 'sub-ommen' is already caught above, so this only matches 'sub-vledder'
        CHANS = ['chan1', 'chan64']
    elif '_acq-blackrock_' in filename.name:
        CHANS = ['chan1', 'chan128']
    else:
        print('you need to specify a reference channel for this test')
        return None, None

    d = Dataset(filename, bids=True)
    event_names, event_onsets = select_events(d, event_type)
    is_ecog = d.dataset.task.channels.tsv['type'] == 'ECOG'
    is_seeg = d.dataset.task.channels.tsv['type'] == 'SEEG'
    chans = array(d.header['chan_name'])[is_ecog | is_seeg]
    data = d.read_data(begtime=event_onsets[0], endtime=event_onsets[-1],
                       chan=list(chans))
    data.data[0][isnan(data.data[0])] = 0  # ignore NaN
    data = montage(data, ref_chan=CHANS)
    freq = frequency(data, taper='hann', duration=2, overlap=0.5)
    hist = make_histogram(data, max=250, step=10)

    divs = []
    fig = plot_hist(hist)
    divs.append(to_div(fig))

    bad_chans = None
    if AUTOMATIC:
        from sklearn.covariance import EllipticEnvelope
        algorithm = EllipticEnvelope(
            contamination=P['data_quality']['histogram']['contamination'])
        prediction = algorithm.fit(hist.data[0]).predict(hist.data[0])
        new_bad_chans = data.chan[0][prediction == -1]
        print('bad channels with histogram / elliptic envelope: '
              + ', '.join(new_bad_chans))
        bad_chans = set(new_bad_chans)
        fig = plot_outliers(hist.chan[0], algorithm.dist_, prediction,
                            yaxis_title='distance', yaxis_type='log')
        divs.append(to_div(fig))

    fig = plot_freq(freq)
    divs.append(to_div(fig))

    if AUTOMATIC:
        from sklearn.neighbors import LocalOutlierFactor
        algorithm = LocalOutlierFactor(
            n_neighbors=P['data_quality']['spectrum']['n_neighbors'])
        prediction = algorithm.fit_predict(freq.data[0])
        new_bad_chans = data.chan[0][prediction == -1]
        print('bad channels with spectrum / local outlier factor: '
              + ', '.join(new_bad_chans))
        bad_chans |= set(new_bad_chans)
        fig = plot_outliers(freq.chan[0], algorithm.negative_outlier_factor_,
                            prediction, yaxis_title='distance',
                            yaxis_type='linear')
        divs.append(to_div(fig))

    # The reference channels are used again here: they were handpicked, but
    # they might have a weird spectrum, so never report them as bad.
    bad_chans -= set(CHANS)

    return bad_chans, divs
def Predict(self):
    if self.ID < 0:
        self.ErrorMessage.setIcon(QMessageBox.Information)
        self.ErrorMessage.setText("You are not logged in")
        self.ErrorMessage.setWindowTitle("Warning!")
        self.ErrorMessage.exec_()
    elif self.String == self.Accounts[self.ID].AccountPassword:
        y = []
        for i in range(len(self.Accounts)):
            if self.Accounts[self.ID].AccountPassword == self.Accounts[i].AccountPassword:
                for x in range(len(self.Accounts[i].TrainData)):
                    y.append(self.Accounts[i].AccountName)
        sts = len(list(set(y)))
        self.ProcessData()
        Xset = []
        Yset = []
        sz = len(self.Accounts[self.ID].AccountPassword) * 2
        for j in range(len(self.Accounts[self.ID].TrainData)):
            Xset.append(array(self.Accounts[self.ID].TrainData)[j][sz:])
            Yset.append(1)
        Xset = array(Xset)
        Yset = array(Yset)
        trainx, testx, trainy, testy = train_test_split(Xset, Yset,
                                                        test_size=0.3,
                                                        random_state=2)
        trainx = array(trainx)
        X = []
        multiy = []
        multi2y = []
        if sts > 1:
            for i in range(len(self.Accounts)):
                if self.Accounts[self.ID].AccountPassword == self.Accounts[i].AccountPassword and self.ID != i:
                    hold = []
                    for k in range(len(self.Accounts[i].TrainData)):
                        hold.append(self.Accounts[i].TrainData[k][16:])
                    X = X + hold
                    for x in range(len(self.Accounts[i].TrainData)):
                        multiy.append(-1)
                        multi2y.append(0)
            X = array(X)
            multiy = array(multiy)
            multi2y = array(multi2y)
            testx = np.concatenate((testx, X))
            testymone = np.concatenate((testy, multiy))
            testymzero = np.concatenate((testy, multi2y))
        if sts == 1:
            testymone = testy
            testymzero = testy

        Osvm = OneClassSVM(kernel='rbf', gamma="auto").fit(trainx)
        Ypredict = Osvm.predict(testx)
        score = f1_score(testymone, Ypredict, pos_label=1)

        kmeans = KMeans(n_clusters=2, random_state=0).fit(trainx)
        Ypredict = kmeans.predict(testx)
        score1 = f1_score(testymzero, Ypredict, pos_label=1)

        brc = Birch(n_clusters=2, threshold=0.01).fit(trainx)
        Ypredict = brc.predict(testx)
        score2 = f1_score(testymzero, Ypredict, pos_label=1)

        IsF = IsolationForest(contamination=0.01)
        IsF.fit(trainx)
        Ypredict = IsF.predict(testx)
        score3 = f1_score(testymone, Ypredict, pos_label=1)

        ev = EllipticEnvelope(contamination=0.01)
        ev.fit(trainx)
        Ypredict = ev.predict(testx)
        score4 = f1_score(testymone, Ypredict, pos_label=1)

        OsvmResult = 'pass' if Osvm.predict([self.Dwell + self.Flight]) == 1 else 'fail'
        kmResult = 'pass' if kmeans.predict([self.Dwell + self.Flight]) == 1 else 'fail'
        brcResult = 'pass' if brc.predict([self.Dwell + self.Flight]) == 1 else 'fail'
        IsFResult = 'pass' if IsF.predict([self.Dwell + self.Flight]) == 1 else 'fail'
        evResult = 'pass' if ev.predict([self.Dwell + self.Flight]) == 1 else 'fail'

        # print(score, score1, score2, score3, score4)
        self.TrainText.setText("Score/Model" + " \n"
                               + str(round(score, 2)) + " Osvm: " + OsvmResult + " \n"
                               + str(round(score1, 2)) + " Km: " + kmResult + " \n"
                               + str(round(score2, 2)) + " Brc: " + brcResult + " \n "
                               + str(round(score3, 2)) + " ISF: " + IsFResult + " \n"
                               + str(round(score4, 2)) + " Ev: " + evResult)
        # if sts > 1:
        #     self.CompareText.setText(self.Accounts[self.ID].AccountPassword)
        #     self.Compare()
        # prediction = self.clf.predict([self.Dwell + self.Flight])
        # str1 = str(prediction)
        # self.TrainText.setText(str(prediction))
        self.Reset()
    else:
        self.ErrorMessage.setIcon(QMessageBox.Information)
        self.ErrorMessage.setText("Your password is wrong")
        self.ErrorMessage.setWindowTitle("Warning!")
        self.ErrorMessage.exec_()
                'IsMale', 'Race-Black', 'Age', 'HAART-Naive',
                'HAART-Non-Adherent', 'HAART-Off', 'HAART-On',
                'Hepatitis C status (HCV)']

for col in tranfer_cols:
    _, cyto_data[col] = cyto_data.align(pat_data[col], join='left', axis=0)
cyto_data['HCV'] = cyto_data['Hepatitis C status (HCV)']

# <codecell>

for col in cytos:
    env = EllipticEnvelope(contamination=0.05)
    env.fit(cyto_data[col].dropna().values.reshape(-1, 1))
    mask = env.predict(cyto_data[col].values.reshape(-1, 1))
    cyto_data[col][mask == -1] = np.nan

# <codecell>

fig, axs = plt.subplots(11, 3, figsize=(10, 20))
for ax, col in zip(axs.flatten(), cytos):
    boxes = []
    mus = []
    stds = []
    for trop in trops:
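# Aside on the masking loop above: the chained assignment
# cyto_data[col][mask == -1] = np.nan can trigger pandas'
# SettingWithCopyWarning. An equivalent .loc form (a hedged rewrite, assuming
# cyto_data is a plain DataFrame) would be:
# cyto_data.loc[mask == -1, col] = np.nan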
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

print(__doc__)

matplotlib.rcParams['contour.negative_linestyle'] = 'solid'

# Example settings
n_samples = 300
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# define outlier/anomaly detection methods to be compared
anomaly_algorithms = [
    ("Robust covariance",
     EllipticEnvelope(contamination=outliers_fraction)),
    ("One-Class SVM",
     svm.OneClassSVM(nu=outliers_fraction, kernel="rbf", gamma=0.1)),
    ("Isolation Forest",
     IsolationForest(behaviour='new',  # legacy parameter, removed in scikit-learn 0.24
                     contamination=outliers_fraction,
                     random_state=42)),
    ("Local Outlier Factor",
     LocalOutlierFactor(n_neighbors=35, contamination=outliers_fraction))]

# Define datasets
blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
datasets = [
    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5,
               **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5],
               **blobs_params)[0],
def detect(model, clean_dataset, random_trials=100):
    logger = logging.getLogger(__name__)
    dummy_input_image = np.zeros((1, *clean_dataset.input_shape))
    KLASSES = list(range(clean_dataset.num_classes))

    logger.info('Computing saliency...')
    sms_ = saliency_map_all(model, dummy_input_image)
    sms_model = [np.linalg.norm(s, ord=2, axis=2, keepdims=True) for s in sms_]

    logger.info('Finding outliers...')
    outs = []
    for sms in sms_model:
        d = sms.reshape(-1, 1)
        env = EllipticEnvelope()
        env.fit(d)
        outliers = env.predict(d).reshape(clean_dataset.input_shape[0],
                                          clean_dataset.input_shape[1], 1)
        outliers[outliers == 1] = 0
        outliers[outliers == -1] = 1
        outs.append(outliers)

    AT_LEAST = ceil(clean_dataset.num_classes / 2 + 1)
    recovered = np.stack([s == 1 for s in outs]).sum(axis=0) >= AT_LEAST

    logger.info('Recovering mask...')
    mask = np.repeat(recovered, clean_dataset.input_shape[2], axis=2)
    mask_size = mask.sum()
    mask_prop = mask_size / (clean_dataset.input_shape[0]
                             * clean_dataset.input_shape[1])
    logger.info('Mask proportion is %.3f', mask_prop)

    def sample_with_klass(val):
        klass = clean_dataset.x_test[clean_dataset.y_test_cat == val]
        while True:
            idx = np.random.choice(len(klass), size=1)[0]
            sample = klass[idx]
            pred = model.predict_classes(sample[np.newaxis, :])[0]
            if val == pred:
                return sample
            else:
                logger.info('Got misclassified sample, retrying...')

    logger.info('Sampling one observation per class in the clean dataset...')
    sample = np.stack([sample_with_klass(val) for val in KLASSES])
    maker = patch.pattern_maker(mask_size, dynamic=True)
    sample_preds = model.predict_classes(sample)
    logger.info('Predictions are: %s', sample_preds)

    def apply_mask(sample):
        _sample = np.copy(sample)
        _sample[:, mask] = maker()
        return _sample

    perturbed = np.stack([apply_mask(sample) for _ in range(random_trials)])

    def trial(i):
        batch = perturbed[:, i, :]
        batch_preds = model.predict_classes(batch)
        return batch_preds

    # one trial per class (the original hard-coded range(10))
    res = [trial(i) for i in range(len(KLASSES))]
    return sms_model, outs, recovered, sample, res, mask_prop
                                     covariance_type='full',
                                     random_state=random),
    'B-GMM-tied': BayesianGaussianMixture(n_components=5,
                                          covariance_type='tied',
                                          random_state=random),
    'B-GMM-diag': BayesianGaussianMixture(n_components=10,
                                          covariance_type='diag',
                                          random_state=random),
    'B-GMM-spherical': BayesianGaussianMixture(n_components=10,
                                               covariance_type='spherical',
                                               random_state=random),
    'EllipticEnvelope': EllipticEnvelope(),
}

DATASETS = {
    #'binary': datasets.make_classification(n_classes=2, n_features=7,
    #                                       n_samples=100, random_state=random),
    #'5way': datasets.make_classification(n_classes=2, n_features=4,
    #                                     n_informative=2, n_samples=6,
    #                                     random_state=random),
    '5way': datasets.make_classification(n_classes=5, n_features=7,
                                         n_informative=5, n_samples=50,
                                         random_state=random),
}

METHODS = [
    'inline',
    #'pymodule',
    #'loadable',
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn.datasets import load_boston  # note: removed in scikit-learn 1.2

# Get data
X1 = load_boston()['data'][:, [8, 10]]  # two clusters
X2 = load_boston()['data'][:, [5, 12]]  # "banana"-shaped

# Define "classifiers" to be used
classifiers = {
    "Empirical Covariance": EllipticEnvelope(support_fraction=1.,
                                             contamination=0.261),
    "Robust Covariance (Minimum Covariance Determinant)":
        EllipticEnvelope(contamination=0.261),
    "OCSVM": OneClassSVM(nu=0.261, gamma=0.05)}
colors = ['m', 'g', 'b']
legend1 = {}
legend2 = {}

# Learn a frontier for outlier detection with several classifiers
xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500))
xx2, yy2 = np.meshgrid(np.linspace(3, 10, 500), np.linspace(-5, 45, 500))
for i, (clf_name, clf) in enumerate(classifiers.items()):
    fig1a = plt.figure(1)
    fig1a.set_size_inches(10, 10)
    clf.fit(X1)
    Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
def outliers_from_ellipticEnvelope():
    from sklearn.covariance import EllipticEnvelope
    env = EllipticEnvelope()
    env.fit(features_pca)  # relies on features_pca from the enclosing scope
    outlier_pred = env.decision_function(features_pca).ravel()
    return outlier_pred
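# A minimal usage sketch for the helper above (assumes features_pca is a 2-D
# array in scope, as the function requires): keep the rows whose
# decision-function score clears a percentile threshold.
from scipy import stats

scores = outliers_from_ellipticEnvelope()
threshold = stats.scoreatpercentile(scores, 10)  # drop the most outlying 10%
inliers = features_pca[scores > threshold]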
from sklearn.cluster import KMeans
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import csv
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from scipy import stats

data = []
# 'rb' is Python 2-style; csv.reader needs text mode in Python 3
with open('newdata.csv', newline='') as f:
    rdr = csv.reader(f)
    for row in rdr:
        data.append([int(row[1]), int(row[2])])
data = np.array(data)
# print(data)

outliers_fraction = 0.05
# est = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05, kernel="rbf", gamma=0.1)
est = EllipticEnvelope(contamination=.1)
# est = KMeans(n_clusters=3)
est.fit(data)
# labels = est.labels_
y_pred = est.decision_function(data).ravel()
threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
labels = [(2 if y > threshold else 1) for y in y_pred]
# labels = est.labels_
print(labels)

plt.scatter(data[:, 0], data[:, 1], c=labels, lw=0)
plt.show()
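# EllipticEnvelope also exposes predict() directly (1 = inlier, -1 = outlier),
# which would replace the manual percentile threshold above; a hedged
# alternative that colors the same scatter plot:
# labels = est.predict(data)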
print("Size of perturbation: ", len(perturbation_X_test)) print("Size of mixed: ", len(mixed_X_test)) # ============ ALGORITHMS ============ # model1 = LinearRegression() model2 = IsolationForest( n_estimators=200, max_samples=200, contamination=0.1, random_state=100) # model2 = IsolationForest() model3 = OneClassSVM(kernel='linear', gamma='auto', nu=0.1) # fix # model3 = OneClassSVM(kernel='poly', gamma='scale', nu=0.01) # model4 = LocalOutlierFactor( # n_neighbors=300, metric="euclidean", contamination=0.1) model4 = LocalOutlierFactor(n_neighbors=200, algorithm="brute", leaf_size=200, contamination=0.1) # fix # model5 = DBSCAN() model6 = EllipticEnvelope( contamination=0.10, random_state=100, support_fraction=0.1) # fix # model fitting outlier detection # print("====== OUTLIER DETECTION =======") X_train_pred2, X_test_pred2 = model2.fit_predict( df_X_train), model2.fit_predict(df_X_test) X_train_pred3, X_test_pred3 = model3.fit_predict( df_X_train), model3.fit_predict(df_X_test) X_train_pred4, X_test_pred4 = model4.fit_predict( df_X_train), model4.fit_predict(df_X_test) # y_pred5 = model5.fit_predict(df) X_train_pred6, X_test_pred6 = model6.fit_predict( df_X_train), model6.fit_predict(df_X_test) # print("====== NOVELTY DETECTION =======")
from sklearn import svm
from sklearn.covariance import EllipticEnvelope

# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# define two outlier detection tools to be compared
classifiers = {
    "One-Class SVM": svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1),
    "robust covariance estimator": EllipticEnvelope(contamination=.1)
}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = 0

# Fit the problem with varying cluster separation
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
    # Data generation (randn needs integer sizes; 0.5 * n_inliers is a float)
    X1 = 0.3 * np.random.randn(n_inliers // 2, 2) - offset
    X2 = 0.3 * np.random.randn(n_inliers // 2, 2) + offset
# elliptic envelope for imbalanced classification
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.covariance import EllipticEnvelope

# generate dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.999], flip_y=0,
                           random_state=4)
# split into train/test sets
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5,
                                                random_state=2, stratify=y)
# define outlier detection model
model = EllipticEnvelope(contamination=0.01)
# fit on majority class only
trainX = trainX[trainy == 0]
model.fit(trainX)
# detect outliers in the test set
yhat = model.predict(testX)
# mark inliers 1, outliers -1
testy[testy == 1] = -1
testy[testy == 0] = 1
# calculate score
score = f1_score(testy, yhat, pos_label=-1)
print('F-measure: %.3f' % score)
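# The contamination above is fixed at 0.01. A hedged extension (not from the
# source) sweeps it on the held-out split and keeps the best F-measure;
# EllipticEnvelope accepts contamination values in (0, 0.5].
best_c, best_f1 = None, -1.0
for c in [0.0001, 0.001, 0.01, 0.05, 0.1]:
    m = EllipticEnvelope(contamination=c).fit(trainX)
    s = f1_score(testy, m.predict(testX), pos_label=-1)
    if s > best_f1:
        best_c, best_f1 = c, s
print('best contamination=%g, F-measure=%.3f' % (best_c, best_f1))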