def outlier_removal2(features, samples, cv_predict):

    outliers_fraction = 0.1

    print(cv_predict.shape)
    print(samples.shape)
    # stack the predictions next to the observed samples and fit the envelope on both
    test = np.column_stack((cv_predict, samples))
    clf = EllipticEnvelope(contamination=.1)
    #clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
    #                      kernel="rbf", gamma=0.1)
    clf.fit(test)
    y_pred = clf.decision_function(test).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)

    # keep only the rows scoring above the outliers_fraction percentile
    y_pred_new = y_pred > threshold
    print(y_pred_new)
    #print(samples[y_pred_new])
    print(samples.shape)
    print(samples[y_pred_new].shape)
    print(features.shape)
    print(features[y_pred_new].shape)

    return features[y_pred_new], samples[y_pred_new]
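A minimal standalone sketch of the same percentile-threshold idea on synthetic data (names and values here are illustrative, not part of the function above): scores above the outliers_fraction percentile are kept, so roughly that fraction of rows is dropped.

import numpy as np
from scipy import stats
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(0)
X_demo = rng.randn(200, 2)                       # hypothetical feature matrix
env = EllipticEnvelope(contamination=0.1)
env.fit(X_demo)
scores = env.decision_function(X_demo).ravel()
cutoff = stats.scoreatpercentile(scores, 100 * 0.1)
keep = scores > cutoff                           # boolean mask of retained rows
print(X_demo[keep].shape)                        # roughly 90% of the rows remain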
    def clean_series(self, token, discard=5):

        """
        Remove outliers from the ratio series for a token.

        Args:
            discard (int): Drop the most outlying X% of the data.

        Returns: OrderedDict{year: wpm}
        """

        series = self.ratios[token]

        X = np.array(list(series.values()))[:, np.newaxis]

        env = EllipticEnvelope()
        env.fit(X)

        # Score each data point.
        y_pred = env.decision_function(X).ravel()

        # Get the discard threshold.
        threshold = stats.scoreatpercentile(y_pred, discard)

        return OrderedDict([
            (year, ratio)
            for (year, ratio), pred in zip(series.items(), y_pred)
            if pred > threshold
        ])
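The same trimming logic as a standalone sketch, assuming a hypothetical {year: ratio} mapping rather than the class's self.ratios store, with discard fixed at 5 percent:

from collections import OrderedDict
import numpy as np
from scipy import stats
from sklearn.covariance import EllipticEnvelope

series = OrderedDict((year, 1.0 + 0.01 * year) for year in range(1900, 1950))
series[1925] = 50.0                              # hypothetical outlying ratio
X = np.array(list(series.values()))[:, np.newaxis]

env = EllipticEnvelope()
env.fit(X)
y_pred = env.decision_function(X).ravel()
threshold = stats.scoreatpercentile(y_pred, 5)   # drop the lowest-scoring 5%

cleaned = OrderedDict(
    (year, ratio)
    for (year, ratio), pred in zip(series.items(), y_pred)
    if pred > threshold
)
print(len(series), "->", len(cleaned))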
    def calc(self, outliers_fraction):

        data, dqs, raw = self.get_data()
        clf = EllipticEnvelope(contamination=outliers_fraction)
        X = list(zip(data['Tbandwidth'], data['Tlatency'], data['Tframerate']))
        clf.fit(X)
        #data['y_pred'] = clf.decision_function(X).ravel()

        #threshold = np.percentile(data['y_pred'], 100 * outliers_fraction)
        data['MDist'] = clf.mahalanobis(X)

        #picking "bad" outliers, not good ones
        outliers = chi2_outliers(data, [.8, .9, .95], 3)
        #print(outliers)
        outliers = [i[i['Tbandwidth'] < i['Tlatency']] for i in outliers]

        #outliers = data[data['y_pred'] < threshold]
        #data['y_pred'] = data['y_pred'] > threshold
        #outliers = [x[['ticketid','MDist']].merge(raw, how='inner').drop_duplicates() for x in outliers]
        #print(raw)
        #outliers = [raw[raw['ticketid'].isin(j['ticketid'])] for j in outliers]
        outliers = [k[k['Tframerate'] < (k['Tframerate'].mean() + k['Tframerate'].std())] for k in outliers]  #making sure we don't remove aberrantly good framerates
        outliers = [t.sort_values(by='MDist', ascending=False).drop_duplicates().drop(['Tbandwidth', 'Tlatency', 'Tframerate'], axis=1) for t in outliers]

        #dqs = raw[raw['ticketid'].isin(dqs['ticketid'])]
        #data = data.sort_values('MDist', ascending=False).drop_duplicates()

        return outliers, dqs, data.sort_values(by='MDist', ascending=False).drop_duplicates().drop(['Tbandwidth', 'Tlatency', 'Tframerate'], axis=1)
Example #4
    def filter_remove_outlayers(self, flat, minimum_value=0):
        """
        Remove outliers using the elliptic envelope from scikit-learn.
        :param flat: 2D array; NaN entries are ignored when fitting
        :param minimum_value: decision-function threshold below which a point is treated as an outlier
        :return: flat with outlier positions set to NaN
        """
        from sklearn.covariance import EllipticEnvelope
        flat0 = flat.copy()
        flat0[np.isnan(flat)] = 0
        x, y = np.nonzero(flat0)
        # print(np.prod(flat.shape))
        # print(len(y))

        z = flat[(x,y)]

        data = np.asarray([x,y,z]).T

        clf = EllipticEnvelope(contamination=.1)
        clf.fit(data)
        y_pred = clf.decision_function(data)


        out_inds = y_pred < minimum_value
        flat[(x[out_inds], y[out_inds])] = np.NaN
        return flat
Example #5
 def model_2_determine_test_data_similarity(self,model):
     clf_EE={}
     model_EE={}
     for i in range(len(model)):
         clf=EllipticEnvelope(contamination=0.01,support_fraction=1)
         clf_EE[i]=clf
         EEmodel=clf.fit(model[i])
         model_EE[i]=EEmodel
     return clf_EE,model_EE
Example #6
def plot(X, y):
    proj = TSNE().fit_transform(X)
    e = EllipticEnvelope(assume_centered=True, contamination=.25) # Outlier detection
    e.fit(X)

    good = np.where(e.predict(X) == 1)
    proj = proj[good]  # keep the 2-D projection in sync with the filtered rows
    X = X[good]
    y = y[good]

    scatter(proj, y)
Example #7
def filterOut(x):
    x = np.array(x)
    outliers_fraction=0.05
    #clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,  kernel="rbf", gamma=0.1) 
    clf = EllipticEnvelope(contamination=outliers_fraction)
    clf.fit(x)
    y_pred = clf.decision_function(x).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)
    y_pred = y_pred > threshold
    return y_pred
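A usage sketch for filterOut on synthetic two-column data (the variable names are illustrative, and the module-level imports of numpy, scipy.stats, and EllipticOnvelope-style sklearn objects used by filterOut are assumed); the returned boolean mask marks the rows kept as inliers.

import numpy as np

rng = np.random.RandomState(42)
points = np.vstack([rng.normal(0, 1, size=(95, 2)),
                    rng.normal(8, 1, size=(5, 2))])   # 5 deliberately shifted rows
mask = filterOut(points)
print(int(mask.sum()), "of", len(points), "rows kept as inliers")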
Example #8
    def module4(self):
        '''
            Detect outliers from the input one-dimensional array using anomaly detection
        '''

        # get data
        img = cv2.imread('../saliency_detection/image/pearl.png')
        b,g,r = cv2.split(img)
        B,G,R = map(lambda x,y,z: x*1. - (y*1. + z*1.)/2., [b,g,r],[r,r,g],[g,b,b])

        Y = (r*1. + g*1.)/2. - np.abs(r*1. - g*1.)/2. - b*1.
        # clip negative values to 0
        R[R<0] = 0
        G[G<0] = 0
        B[B<0] = 0
        Y[Y<0] = 0
        rg = cv2.absdiff(R,G)
        by = cv2.absdiff(B,Y)
        img1 = rg
        img2 = by

        rg, by = map(lambda x: x.reshape((len(b[0])*len(b[:,0]), 1)), [rg, by])
        data = np.hstack((rg, by))
        data = data.astype(np.float64)
        data = np.delete(data, range(0, len(data[:,0]), 2), 0)

        # grid
        xx1, yy1 = np.meshgrid(np.linspace(-10, 300, 500), np.linspace(-10, 300, 500))

        # fit the model to find the boundary; a larger contamination shrinks the ellipse
        clf = EllipticEnvelope(support_fraction=1, contamination=0.01)
        print('data.shape =>', data.shape)
        print('learning...')
        clf.fit(data)  # fitting may fail if the data contains zeros
        print('complete learning!')

        # score the grid with the fitted model and draw the decision ellipse
        z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
        z1 = z1.reshape(xx1.shape)
        plt.contour(xx1, yy1, z1, levels=[0], linewidths=2, colors='r')

        # plot
        plt.scatter(data[:,0], data[:,1], color='black')
        plt.title("Outlier detection")
        plt.xlim((xx1.min(), xx1.max()))
        plt.ylim((yy1.min(), yy1.max()))
        plt.pause(.001)
        # plt.show()

        cv2.imshow('rg', img1/np.amax(img1))
        cv2.imshow('by', img2/np.amax(img2))
Example #9
def find_outlier_test_homes(df,all_homes,  appliance, outlier_features, outliers_fraction=0.1):
    from scipy import stats

    from sklearn import svm
    from sklearn.covariance import EllipticEnvelope
    clf = EllipticEnvelope(contamination=.1)
    try:
        X = df.ix[all_homes[appliance]][outlier_features].values
        clf.fit(X)
    except:
        try:
            X = df.ix[all_homes[appliance]][outlier_features[:-1]].values
            clf.fit(X)
        except:
            try:
                X = df.ix[all_homes[appliance]][outlier_features[:-2]].values
                clf.fit(X)
            except:
                print "outlier cannot be found"
                return df.ix[all_homes[appliance]].index.tolist()


    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)
    y_pred = y_pred > threshold
    return df.ix[all_homes[appliance]][~y_pred].index.tolist()
Example #10
def ellipticenvelope(data, fraction=0.02):
    elenv = EllipticEnvelope(contamination=fraction)
    elenv.fit(data)
    score = elenv.predict(data)  # 1 for inliers, -1 for outliers

    # 1-based row numbers, one per observation
    numeration = np.array([[i] for i in range(1, len(data) + 1)])

    # keep only the row numbers predicted as outliers
    anomalies = numeration[score == -1]

    return anomalies
Example #11
def elliptic_envelope(df, modelDir, norm_confidence=0.95):
	from sklearn.covariance import EllipticEnvelope
	from scipy.stats import normaltest

	if "ds" in df.columns:
		del df["ds"]
	model = EllipticEnvelope()
	test_stats, p_vals = normaltest(df.values, axis=0)
	normal_cols = p_vals >= (1 - norm_confidence)
	df = df.loc[:, normal_cols]
	if df.shape[1] == 0:
		return None
	df["outlier"] = model.fit_predict(df.values)
	df["outlier"] = df["outlier"] < 0  # fit_predict returns 1 for inliers, -1 for outliers
	return df
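A small usage sketch (the DataFrame and its column names are hypothetical, and modelDir is passed through unused here): columns that fail the normality test are dropped before fitting, and the returned frame gains a boolean outlier column.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
frame = pd.DataFrame({
    "ds": pd.date_range("2020-01-01", periods=200),
    "load": rng.normal(50, 5, 200),
    "temp": rng.normal(20, 2, 200),
})
flagged = elliptic_envelope(frame.copy(), modelDir=None)
if flagged is not None:
    print(int(flagged["outlier"].sum()), "rows flagged as outliers")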
def labelValidSkeletons(skel_file, valid_index, trajectories_data, fit_contamination = 0.05):
    #calculate valid widths if they were not used
    calculate_widths(skel_file)
    
    #calculate classifier for the outliers    
    X4fit = nodes2Array(skel_file, valid_index)        
    clf = EllipticEnvelope(contamination = fit_contamination)
    clf.fit(X4fit)
    
    #calculate outliers using the fitted classifier
    X = nodes2Array(skel_file) #use all the indexes
    y_pred = clf.decision_function(X).ravel() #less than zero would be an outlier

    #labeled rows of valid individual skeletons as GOOD_SKE
    trajectories_data['auto_label'] = ((y_pred>0).astype(np.int))*wlab['GOOD_SKE'] #+ wlab['BAD']*np.isnan(y_prev)
    saveLabelData(skel_file, trajectories_data)
def test_score_samples():
    X_train = [[1, 1], [1, 2], [2, 1]]
    clf1 = EllipticEnvelope(contamination=0.2).fit(X_train)
    clf2 = EllipticEnvelope().fit(X_train)
    assert_array_equal(clf1.score_samples([[2., 2.]]),
                       clf1.decision_function([[2., 2.]]) + clf1.offset_)
    assert_array_equal(clf2.score_samples([[2., 2.]]),
                       clf2.decision_function([[2., 2.]]) + clf2.offset_)
    assert_array_equal(clf1.score_samples([[2., 2.]]),
                       clf2.score_samples([[2., 2.]]))
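A quick illustrative check of the same identity outside the test suite (a sketch on synthetic data): score_samples equals decision_function plus offset_, and for EllipticEnvelope it is the negated Mahalanobis distance.

import numpy as np
from sklearn.covariance import EllipticEnvelope

rng = np.random.RandomState(0)
X = rng.randn(100, 2)
env = EllipticEnvelope(contamination=0.2).fit(X)

point = np.array([[2.0, 2.0]])
print(np.allclose(env.score_samples(point), env.decision_function(point) + env.offset_))
print(np.allclose(env.score_samples(point), -env.mahalanobis(point)))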
Example #14
def labelValidSkeletons(skel_file):
    calculate_widths(skel_file)
    
    #get valid rows using the trajectory displacement and the skeletonization success
    valid_index, trajectories_data = getValidIndexes(skel_file)
    
    #calculate classifier for the outliers    
    X4fit = nodes2Array(skel_file, valid_index)        
    clf = EllipticEnvelope(contamination=.1)
    clf.fit(X4fit)
    
    #calculate outliers using the fitted classifier
    X = nodes2Array(skel_file)
    y_pred = clf.decision_function(X).ravel() #less than zero would be an outlier

    #labeled rows of valid individual skeletons as GOOD_SKE
    trajectories_data['auto_label'] = ((y_pred>0).astype(np.int))*wlab['GOOD_SKE'] #+ wlab['BAD']*np.isnan(y_prev)
    saveLabelData(skel_file, trajectories_data)
def test_outlier_detection():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    assert_raises(NotFittedError, clf.predict, X)
    assert_raises(NotFittedError, clf.decision_function, X)
    clf.fit(X)
    y_pred = clf.predict(X)
    decision = clf.decision_function(X, raw_values=True)
    decision_transformed = clf.decision_function(X, raw_values=False)

    assert_array_almost_equal(decision, clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)), (100 - y_pred[y_pred == -1].size) / 100.0)
    assert sum(y_pred == -1) == sum(decision_transformed < 0)
def test_outlier_detection():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    clf.fit(X)
    y_pred = clf.predict(X)

    assert_array_almost_equal(
        clf.decision_function(X, raw_values=True), clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.)
def labelValidSkeletons_old(skeletons_file, good_skel_row, fit_contamination = 0.05):
    base_name = getBaseName(skeletons_file)
    progress_timer = timeCounterStr('');
    
    print_flush(base_name + ' Filter Skeletons: Starting...')
    with pd.HDFStore(skeletons_file, 'r') as table_fid:
        trajectories_data = table_fid['/trajectories_data']

    trajectories_data['is_good_skel'] = trajectories_data['has_skeleton']
    
    if good_skel_row.size > 0:
        #nothing to do if there are not valid skeletons left. 
        
        print_flush(base_name + ' Filter Skeletons: Reading features for outlier identification.')
        #calculate classifier for the outliers    
        
        nodes4fit = ['/skeleton_length', '/contour_area'] + \
        ['/' + name_width_fun(part) for part in worm_partitions]
        
        X4fit = nodes2Array(skeletons_file, nodes4fit, good_skel_row)
        assert not np.any(np.isnan(X4fit))
        
        #%%
        print_flush(base_name + ' Filter Skeletons: Fitting elliptic envelope. Total time:' + progress_timer.getTimeStr())
        #TODO there is a problem with singular covariance matrices that I need to figure out how to solve
        clf = EllipticEnvelope(contamination = fit_contamination)
        clf.fit(X4fit)
        
        print_flush(base_name + ' Filter Skeletons: Calculating outliers. Total time:' + progress_timer.getTimeStr())
        #calculate outliers using the fitted classifier
        X = nodes2Array(skeletons_file, nodes4fit) #use all the indexes
        y_pred = clf.decision_function(X).ravel() #less than zero would be an outlier

        print_flush(base_name + ' Filter Skeletons: Labeling valid skeletons. Total time:' + progress_timer.getTimeStr())
        #labeled rows of valid individual skeletons as GOOD_SKE
        trajectories_data['is_good_skel'] = (y_pred>0).astype(np.int)
    
    #Save the new is_good_skel column
    saveModifiedTrajData(skeletons_file, trajectories_data)

    print_flush(base_name + ' Filter Skeletons: Finished. Total time:' + progress_timer.getTimeStr())
def test_elliptic_envelope():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    assert_raises(NotFittedError, clf.predict, X)
    assert_raises(NotFittedError, clf.decision_function, X)
    clf.fit(X)
    y_pred = clf.predict(X)
    scores = clf.score_samples(X)
    decisions = clf.decision_function(X)

    assert_array_almost_equal(
        scores, -clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.)
    assert(sum(y_pred == -1) == sum(decisions < 0))
Example #19
def transform( features, labels ):

#    for ff, ll in zip(features, labels):
#        print ll, ff
#    for rr in range(0, len(features) ):
#        features[rr] = scaler.fit_transform( features[rr] )

    print "transforming features via pca"
    pca = PCA(n_components = 30)
    features = pca.fit_transform( features )

    envelope = EllipticEnvelope()
    envelope.fit( features )
    print(envelope.predict(features))

    scaler = MinMaxScaler()
    features = scaler.fit_transform( features )



    return features, labels
def detect_outliers(X, station):
    if station == 'hoerning':
        outlierfraction = 0.0015
        classifier = svm.OneClassSVM(nu=0.95*outlierfraction + 0.05,
                                     kernel='rbf', gamma=0.1)
        Xscaler = StandardScaler(copy=True, with_mean=True, with_std=True).fit(X)
        X_scaled = Xscaler.transform(X)
        classifier.fit(X_scaled)
        svcpred = classifier.decision_function(X_scaled).ravel()
        threshold = stats.scoreatpercentile(svcpred, 100*outlierfraction)
        inlierpred = svcpred > threshold

    else:
        outlierfraction = 0.0015
        classifier = EllipticEnvelope(contamination=outlierfraction)
        classifier.fit(X)
        gausspred = classifier.decision_function(X).ravel()
        threshold = stats.scoreatpercentile(gausspred, 100*outlierfraction)
        inlierpred = gausspred > threshold

    return inlierpred
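A hedged call sketch for the covariance branch (any station name other than 'hoerning'), using synthetic data; it assumes the module-level imports (numpy, scipy.stats, sklearn) that detect_outliers relies on.

import numpy as np

rng = np.random.RandomState(1)
X_demo = rng.normal(size=(1000, 2))        # hypothetical measurements
inliers = detect_outliers(X_demo, station='other')
print(int(inliers.sum()), "of", len(X_demo), "points kept as inliers")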
Example #21
def CovEstOD(data, classifier=None, N=1, **kw):
    if classifier is None:
        from sklearn.covariance import EllipticEnvelope
        contamination = N / data.shape[0]
        classifier = EllipticEnvelope(support_fraction=1., contamination=contamination)

    classifier.fit(data)
    clipix, = np.where( classifier.predict(data) == -1)
    
    wdb = kw.pop( 'with_decision_boundary', False )
    #TODO:  A better way of finding the decision boundary
    if wdb:
        w, T = np.linalg.eigh(classifier.precision_)        #T (eigenvectors of precision matrix) is the transformation matrix between principal axes and data coordinates
        Ti = np.linalg.inv(T)
        M = np.dot(Ti, classifier.precision_) * T           #Diagonalizing the precision matrix ==> quadratic representation of decision boundary (ellipse): z^T M z = threshold, where x-<x> = Tz transforms to principal axes
        a, b = np.sqrt(classifier.threshold / np.diag(M))   #semi-major & semi-minor axes
        theta = np.degrees(np.arccos(T[0, 0]))              #T is an (im)proper rotation matrix
        theta = np.linalg.det(T) * theta                    #If det(T) = -1 ==> improper rotation matrix (rotoinversion - one of the axes is inverted)
        decision_boundary = Ellipse(classifier.location_, 2*a, 2*b, theta, color='m')
        return clipix, decision_boundary
    else:
        return clipix
Example #22
def find_outlier_train(ser, outliers_fraction=0.1, min_units=0.2):
    # Returns outlier, inliers

    X = ser[ser>min_units].reshape(-1,1)
    #is_normal_data = is_normal(ser)
    # FOR NOW only using Robust estimator of Covariance
    is_normal_data = True
    if is_normal_data:
        # Use robust estimator of covariance
        from sklearn.covariance import EllipticEnvelope
        clf = EllipticEnvelope(contamination=.1)
    else:
        #Data is not normally distributed, use OneClassSVM based outlier detection
        from sklearn import svm
        clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                                     kernel="rbf", gamma=0.1)
    from scipy import stats

    clf.fit(X)
    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)
    y_pred = y_pred > threshold
    return ser[ser>min_units][~y_pred], ser[ser>min_units][y_pred]
def anomaly_detection(features, labels):
	# In this function, I try to use anomaly detection method (using mutivariate gaussian distribution) to identify poi-s
	non_pois = features[labels==0]
	pois = features[labels==1]
	print "non poi size", non_pois.shape, pois.shape, features.shape

	## Spliting data to train, test and cross validation set for anomaly detection

	split1 = produce_spliting_array(non_pois.shape[0], .75 )
	X_train = non_pois[split1==1]

	X_intermediate = non_pois[split1==0]

	print "size intermediate", X_intermediate.shape

	split2 = produce_spliting_array(X_intermediate.shape[0], .5 )

	X_test = X_intermediate[split2==1]
	label_test = np.zeros((X_test.shape[0],), dtype=np.int) - 1

	X_cv = X_intermediate[split2==0]
	label_cv = np.zeros((X_cv.shape[0],), dtype=np.int) - 1

	split3 = produce_spliting_array(pois.shape[0], .5 )
	X_test = np.vstack((X_test, pois[split3==1]))
	label_test = np.hstack((label_test, np.ones(sum(split3), dtype=np.int)))

	X_cv = np.vstack((X_cv, pois[split3==0]))
	label_cv = np.hstack((label_cv, np.ones(sum(split3==0), dtype=np.int)))



	print "size X_train", X_train.shape
	print "size test data", X_test.shape, label_test.shape
	print "size cv data", X_cv.shape, label_cv.shape
	print "size splits", len(split1), len(split2), len(split3)

	from sklearn.covariance import EllipticEnvelope
	detector = EllipticEnvelope(contamination=.85)
	detector.fit(X_train)
	pred_cv = detector.predict(X_cv)
	print(pred_cv)
	print(label_cv)
	print(detector.score(X_cv, label_cv))
cls_nums=np.array(np.unique(y_trn, return_counts=True)).T
ol_label=cls_nums[0,0]
ol_count=cls_nums[0,1]
il_label=cls_nums[1,0]
il_count=cls_nums[1,1]
if cls_nums[0,1]>cls_nums[1,1]:
    ol_label=cls_nums[1,0]
    ol_count=cls_nums[1,1]
    il_label=cls_nums[0,0]
    il_count=cls_nums[0,1]

outlier_fraction = ol_count/n_samples
print("Outlier fraction: {}".format(outlier_fraction))

clsf_names=["Robust covariance", "One-class SVM", "Isolation Forest","Local Outlier Factor"]
anomaly_algorithms = [EllipticEnvelope(contamination=outlier_fraction),
                      svm.OneClassSVM(nu=outlier_fraction, kernel="rbf",gamma=0.1),
                      IsolationForest(contamination=outlier_fraction,random_state=42),
                      LocalOutlierFactor(n_neighbors=35, contamination=outlier_fraction)]

if args.admodel<0 or args.admodel>3:
    print("Anomal detection algorithm ID should be between 0 and 3")
    exit()

clsf = anomaly_algorithms[args.admodel]
clsf.fit(x_trn)
if clsf_names[args.admodel] == "Local Outlier Factor":
    y_pred = clsf.fit_predict(x_tst)
else:
    y_pred = clsf.predict(x_tst)
def test_elliptic_envelope():
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)
    with pytest.raises(NotFittedError):
        clf.predict(X)
    with pytest.raises(NotFittedError):
        clf.decision_function(X)
    clf.fit(X)
    y_pred = clf.predict(X)
    scores = clf.score_samples(X)
    decisions = clf.decision_function(X)

    assert_array_almost_equal(scores, -clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - y_pred[y_pred == -1].size) / 100.)
    assert (sum(y_pred == -1) == sum(decisions < 0))
Example #26
def find_outliers(datestart,dateend,plot=False,cut=-0.05):
    numtopics=84

    di=datetime2str2(datestart)
    dfin=datetime2str2(dateend)

    #print di,dfin
    if dfin<di:
        temp=dfin
        dfin=di
        di=temp
    #print di,dfin
    
    afile="/home/ubuntu/mysql_insightwiki_auth.txt"
    a=open(afile)
    passwd=a.readline().rstrip()
    a.close()
    host='localhost'; user='******';db='wikidata'
    con = mdb.connect(host, user, passwd, db)#,port=3307)
     
    with con:
        curt= con.cursor()
        #sql="SELECT COUNT(*) FROM `topics` "
        
        sql="SELECT `Id`,`topic_label`,`topic_string` FROM `topics`;"
        curt.execute(sql)
        topics=[[0,'nothing','Filler to match index']]
        for topic in curt:
            topics.append(topic)

    data={}
        
    df = list(range(numtopics + 1))  #index 0 is unused so list positions match the SQL ids
    with con:
        curt= con.cursor()
        sql="SELECT `Id`,`topic_label`,`topic_string` FROM `topics`;"
        curt.execute(sql)
        for row in curt:
            cur = con.cursor()
            sql='''SELECT `page_views`.`dateonly` AS `vd`, AVG(`page_views`.`count`) AS `vc`, 
                `topics`.`topic_label`,`topics`.`topic_string` 
                FROM `topics` INNER JOIN `page_views` ON `topics`.`ID` = `page_views`.`topic_id` 
                WHERE `topic_id`=%s GROUP BY `page_views`.`dateonly`   '''
            data[row[1]]=read_sql(sql, con,params=[row[0]])
            df[row[0]]=data[row[1]]
    
    topicdata=df
    
    d=topicdata[topics[3][0]]
    p=d[ (d['vd']>di) & (d['vd']<dfin )]['vc'].values    
    topicdata=df
    
    #initializing array to hold the rows to cluster
    #the 0th position is fake so that my index matches the sql index
    clusinp=[]
    clusinp.append(gen_feat([0,0,0,0,0]))
    
    chinaoff=6000
    #populating my array to go into my Kmean
    for index,topic in enumerate(topics):
        #topic=list(topics[index])
        if topic[0]!=0:
            d=topicdata[topic[0]]
            ppre=d[ (d['vd']>di) & (d['vd']<dfin )]['vc'].values
            p=gen_feat(ppre)
            if topic[0]==52:
                p=gen_feat([x-chinaoff if x-chinaoff>=0 else 0 for x in ppre  ])
            clusinp.append(p)
    
    #cleaning up my array making it numpy to go into my kmean
    clusinp=np.array(clusinp)
    clusinp[0]=clusinp[5] #making sure my throw-away first row matches in size
    #contam=0.325
    contamfix=0.1
    
    colors = ['m', 'g', 'b']
    X1=clusinp
    xx1, yy1 = np.meshgrid(np.linspace(0, 10000, 500), np.linspace(-1.5, 1.5, 500))
    ee=EllipticEnvelope(support_fraction=1., contamination=contamfix)
    #ee=OneClassSVM(nu=contam2, gamma=0.05,kernel='rbf')
    ee.fit(clusinp)
    outliers=ee.decision_function(X1, raw_values=False)
    
    if plot==True:
        print "here"
        get_ipython().magic(u'matplotlib inline')
        Z1 = ee.decision_function(np.c_[xx1.ravel(), yy1.ravel()])    
        Z1 = Z1.reshape(xx1.shape)
        legend1 = plt.contour(xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[1])
        plt.scatter(X1[:, 0], X1[:, 1], color='black')
        plt.xlim((xx1.min(), xx1.max()))
        plt.ylim((yy1.min(), yy1.max()))
        plt.show()

    out=[]
    for index,outlier in enumerate(outliers):
        row=[index,outlier,topics[index][1],int(np.round(clusinp[index][0])),int(np.round(100*clusinp[index][1]))]
        #row=[index,outlier,topics[index][1],int(np.round(clusinp[index][0])),clusinp[index][1]]
        if outlier<cut and index!=0 and row[3]>8:
            out.append(row)
            #print index,outlier,topics[index][2],clusinp[index][0],clusinp[index][1]
    #out=sorted(out,operator.itemgetter(4))
    #out.sort()
    out=sorted(out,key =lambda x:-x[4])
    return out
Example #27
                        # ,
                        # label=target_name.decode('utf8')
                        )

            x, y = find_boundary(X_transformed[kclusters == i, 0],
                                 X_transformed[kclusters == i, 1], 5)
            plt.plot(x, y, '-k', lw=2., color=cluster_color)

            # create a mesh to plot in
            h = .02  # step size in the mesh
            x_min, x_max = X_transformed[kclusters == i, 0].min() - 1, X_transformed[kclusters == i, 0].max() + 1
            y_min, y_max = X_transformed[kclusters == i, 1].min() - 1, X_transformed[kclusters == i, 1].max() + 1
            xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
                                 np.arange(y_min, y_max, h))

            clf = EllipticEnvelope(contamination=.1)
            clf.fit(X_transformed[kclusters == i])

            pred = clf.decision_function(X_transformed[kclusters == i]).ravel()
            threshold = stats.scoreatpercentile(pred,
                                                100 * outliers_fraction)
            print("INFO: Cluster: ", i, " Threshold: ", threshold)

            Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])

            Z = Z.reshape(xx.shape)
            # plt.contour(xx, yy, Z,
            #             levels=[threshold],
            #             linewidths=2,
            #             linestyles='solid',
            #             colors=(cluster_color,))
Example #28
 def __getRemovedOutlierRobustDf(self, _df, _column_drop=True):    
   robust = EllipticEnvelope(contamination=OUTLIER_FRACTION)
   return self.__getRemoveOutlierDf(_df, robust, self.column_name_robust, _column_drop)
def elliptic_envelope(series, contamination=0.1):
    clf = EllipticEnvelope(contamination=contamination, random_state=0)
    series = series.values.reshape(-1, 1)
    clf.fit(series)
    return clf.predict(series)
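A brief usage sketch with a synthetic pandas Series (values are illustrative, and EllipticEnvelope is assumed to be imported as in this module): the prediction is 1 for inliers and -1 for outliers.

import numpy as np
import pandas as pd

rng = np.random.RandomState(0)
values = pd.Series(np.concatenate([rng.normal(10, 1, 95), [30, 31, 29, 32, 28]]))
labels = elliptic_envelope(values, contamination=0.05)
print(int((labels == -1).sum()), "points labelled as outliers")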
        "robust covariance estimator": EllipticEnvelope(contamination=.1)}

    # Compare given classifiers under given settings
    xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 1000), np.linspace(0, 100, 1000))
    n_inliers = int((1. - outliers_fraction) * n_samples)
    n_outliers = int(outliers_fraction * n_samples)

    # Fit the problem with varying cluster separation
    np.random.seed(42)
    # Data generation


    # Fit the model with the One-Class SVM
    #plt.figure(figsize=(10, 5))

    clf = EllipticEnvelope(contamination=.1)
    # fit the data and tag outliers
    clf.fit(XY)
    y_pred = clf.decision_function(XY).ravel()
    threshold = stats.scoreatpercentile(y_pred,
                                        100 * outliers_fraction)
    y_pred = y_pred > threshold
    # plot the levels lines and the points
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    subplot = ax[i]
    subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7),
                     cmap=plt.cm.Blues_r)
    a = subplot.contour(xx, yy, Z, levels=[threshold],
                        linewidths=2, colors='red')
    subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()],
Example #31
def anomaly_detection_AUC_experiment_batch(anomaly_method, dataset, X_train_in_folds, X_test_in_folds, y_train_in_folds, y_test_in_folds):
    rng = np.random.RandomState(42)
    n_folds = len(X_train_in_folds)
    auc_test_array = np.zeros((n_folds,))
    auc_train_array = np.zeros((n_folds,))
    time_of_algorithm_test = np.zeros((n_folds,))
    time_of_algorithm_train = np.zeros((n_folds,))
    for fold_index in range(n_folds):
        X_train = X_train_in_folds[fold_index]
        X_test = X_test_in_folds[fold_index]
        y_train = y_train_in_folds[fold_index]
        y_test = y_test_in_folds[fold_index]
        if fold_index == 0:
            y = list(y_train)
            y.extend(y_test)
            y = np.asarray(y)
            # print(y)
            percentage_of_anomalies = sum(y == -1) / len(y)
            print("percentage of the anomalies = " + str(percentage_of_anomalies))
        if anomaly_method == "iso_forest":
            clf = IsolationForest(random_state=rng)
            start = time.time()
            clf.fit(X=X_train)
            scores_train = clf.decision_function(X=X_train)
            end = time.time()
            time_of_algorithm_train[fold_index] = end - start
            start = time.time()
            scores_test = clf.decision_function(X=X_test)
            end = time.time()
            time_of_algorithm_test[fold_index] = end - start
        elif anomaly_method == "one_class_SVM":
            clf = OneClassSVM(gamma='auto')
            start = time.time()
            clf.fit(X=X_train)
            scores_train = clf.decision_function(X=X_train)
            end = time.time()
            time_of_algorithm_train[fold_index] = end - start
            start = time.time()
            scores_test = clf.decision_function(X=X_test)
            end = time.time()
            time_of_algorithm_test[fold_index] = end - start
        elif anomaly_method == "LOF":
            n_neighbors = 10
            clf = LOF(n_neighbors=n_neighbors, contamination=0.1)
            start = time.time()
            clf.fit(X=X_train)
            scores_train = clf.negative_outlier_factor_
            end = time.time()
            time_of_algorithm_train[fold_index] = end - start
            clf = LOF(n_neighbors=n_neighbors, novelty=True, contamination=0.1)
            start = time.time()
            clf.fit(X=X_train)
            scores_test = clf.decision_function(X=X_test)
            end = time.time()
            time_of_algorithm_test[fold_index] = end - start
        elif anomaly_method == "covariance_estimator":
            clf = EllipticEnvelope(random_state=rng)
            start = time.time()
            clf.fit(X=X_train)
            scores_train = clf.decision_function(X=X_train)
            end = time.time()
            time_of_algorithm_train[fold_index] = end - start
            start = time.time()
            scores_test = clf.decision_function(X=X_test)
            end = time.time()
            time_of_algorithm_test[fold_index] = end - start
        elif anomaly_method == "iMondrian_forest":
            settings, data, param, cache, train_ids_current_minibatch = MondrianForest.prepare_training_data(X=X_train, num_trees=100)
            clf = MondrianForest(settings, data)
            subsampling_size = 256
            start = time.time()
            # clf.fit(data, train_ids_current_minibatch, settings, param, cache, subsampling_size=None)
            clf.fit(data, train_ids_current_minibatch, settings, param, cache, subsampling_size=subsampling_size)
            scores, scores_shifted = clf.get_anomaly_scores(test_data=X_train, settings=settings, subsampling_size=None)
            scores_train = scores_shifted
            end = time.time()
            time_of_algorithm_train[fold_index] = end - start
            start = time.time()
            scores, scores_shifted = clf.get_anomaly_scores(test_data=X_test, settings=settings, subsampling_size=None)
            scores_test = scores_shifted
            end = time.time()
            time_of_algorithm_test[fold_index] = end - start
        # scores_test = -1 * scores_test  #--> to have: the more score, the less anomaly
        fpr_test, tpr_test, thresholds_test = metrics.roc_curve(y_test, scores_test, pos_label=1) #--> https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html
        fpr_train, tpr_train, thresholds_train = metrics.roc_curve(y_train, scores_train, pos_label=1)
        # plt.plot(fpr_test, tpr_test)
        # plt.show()
        # plt.plot(fpr_train, tpr_train)
        # plt.show()
        auc_test = metrics.auc(fpr_test, tpr_test)  #--> https://scikit-learn.org/stable/modules/generated/sklearn.metrics.auc.html
        print("Fold: " + str(fold_index) + " ---> AUC for test: " + str(auc_test))
        auc_test_array[fold_index] = auc_test
        auc_train = metrics.auc(fpr_train, tpr_train)
        print("Fold: " + str(fold_index) + " ---> AUC for train: " + str(auc_train))
        auc_train_array[fold_index] = auc_train
    auc_test_mean = auc_test_array.mean()
    auc_test_std = auc_test_array.std()
    auc_train_mean = auc_train_array.mean()
    auc_train_std = auc_train_array.std()
    time_of_algorithm_train_mean = time_of_algorithm_train.mean()
    time_of_algorithm_train_std = time_of_algorithm_train.std()
    time_of_algorithm_test_mean = time_of_algorithm_test.mean()
    time_of_algorithm_test_std = time_of_algorithm_test.std()
    print("Average AUC for test data: " + str(auc_test_mean) + " +- " + str(auc_test_std))
    print("Average time for test data: " + str(time_of_algorithm_test_mean) + " +- " + str(time_of_algorithm_test_std))
    print("Average AUC for train data: " + str(auc_train_mean) + " +- " + str(auc_train_std))
    print("Average time for train data: " + str(time_of_algorithm_train_mean) + " +- " + str(time_of_algorithm_train_std))
    if anomaly_method == "LOF" or anomaly_method == "CAD":
        path = './output/batch/' + dataset + "/" + anomaly_method + "/neigh=" + str(n_neighbors) + "/"
    else:
        path = './output/batch/' + dataset + "/" + anomaly_method + "/"
    save_np_array_to_txt(variable=auc_test_array, name_of_variable="auc_test_array", path_to_save=path)
    save_np_array_to_txt(variable=auc_test_mean, name_of_variable="auc_test_mean", path_to_save=path)
    save_np_array_to_txt(variable=auc_test_std, name_of_variable="auc_test_std", path_to_save=path)
    save_np_array_to_txt(variable=auc_train_array, name_of_variable="auc_train_array", path_to_save=path)
    save_np_array_to_txt(variable=auc_train_mean, name_of_variable="auc_train_mean", path_to_save=path)
    save_np_array_to_txt(variable=auc_train_std, name_of_variable="auc_train_std", path_to_save=path)
    save_np_array_to_txt(variable=time_of_algorithm_test, name_of_variable="time_of_algorithm_test", path_to_save=path)
    save_np_array_to_txt(variable=time_of_algorithm_test_mean, name_of_variable="time_of_algorithm_test_mean", path_to_save=path)
    save_np_array_to_txt(variable=time_of_algorithm_test_std, name_of_variable="time_of_algorithm_test_std", path_to_save=path)
    save_np_array_to_txt(variable=time_of_algorithm_train, name_of_variable="time_of_algorithm_train", path_to_save=path)
    save_np_array_to_txt(variable=time_of_algorithm_train_mean, name_of_variable="time_of_algorithm_train_mean", path_to_save=path)
    save_np_array_to_txt(variable=time_of_algorithm_train_std, name_of_variable="time_of_algorithm_train_std", path_to_save=path)
    save_np_array_to_txt(variable=percentage_of_anomalies, name_of_variable="percentage_of_anomalies", path_to_save=path)
Example #32
from sklearn.covariance import EllipticEnvelope
import numpy as np
aaa = np.array([[1,2,-10000,3,4,6,7,8,90,100,5000]])
aaa = np.transpose(aaa)
# sklearn mostly expects column-vector (2D) input

outlier = EllipticEnvelope(contamination=.1)
# "assume 10% of the data are outliers and find them"
# usually set to less than 10%
# the difference is that it models the data with a Gaussian distribution and its covariance
outlier.fit(aaa)

print(outlier.predict(aaa))

# contamination=.3
# [ 1  1 -1  1  1  1  1  1  1 -1 -1]
# contamination=.2
# [ 1  1 -1  1  1  1  1  1  1  1 -1]
# contamination=.1
# [ 1  1 -1  1  1  1  1  1  1  1  1]

# ======================
# Does it also work on 2D data, not just 1D?
# Yes, it does
# columns are treated as the features

# aaa = np.array([[1,2,3,4,10000,6,7,5000,90,100],
#                 [1000,2000,3,4000,5000,6000,7000,8,9000,10000]])

# [ 1  1  1  1 -1  1  1  1  1  1]
# "looking at the whole (10, 2) dataset, there is an outlier somewhere in row 5"
Example #33
def AnomalyDetection(filepath):
	train_X = np.loadtxt(filepath+'normalized_train_file.csv', delimiter=',', dtype=float, skiprows=1)
	test_X = np.loadtxt(filepath+'pseudonormalized_test_file.csv', delimiter=',',dtype=float, skiprows=1)
	train_Y = np.loadtxt(filepath+'Y_train_file.csv', delimiter=',',dtype=float, skiprows=1)
	test_Y = np.loadtxt(filepath+'Y_test_file.csv', delimiter=',', dtype=float, skiprows=1)
	input_dimensions = str(train_X.shape[1]) #feature length
	samples_size =str(train_X.shape[0]) #number of rows
	input_dimensions_test = str(test_X.shape[1] )#feature length
	samples_size_test = str(test_X.shape[0]) #number of rows
	num_failed_train = train_Y[train_Y==1].shape[0]
	num_failed_test = test_Y[test_Y==1].shape[0]

	with open(filepath+'outliers_new_results.txt', 'w') as output:
		output.write("===== DATA INFORMATION =====\n")
		output.write('training data size: ' +samples_size +' by '+ input_dimensions+'\n')
		output.write('test data size: '  +samples_size_test +' by '+ input_dimensions_test+'\n')
		output.write('failed points in training: ' + str(num_failed_train))
		output.write('failed points in testing: ' + str(num_failed_test))

		#change input data for this method:
		training = train_X[np.where(train_Y==0)]
		testing = np.concatenate((test_X,train_X[np.where(train_Y==1)]))
		testing_Y =  np.concatenate((test_Y,train_Y[np.where(train_Y==1)]))
		input_dimensions = str(training.shape[1]) #feature length
		samples_size =str(training.shape[0]) #number of rows
		input_dimensions_test = str(testing.shape[1] )#feature length
		samples_size_test = str(testing.shape[0]) #number of rows
		#####################################################################
		# ONE CLASS SVM
		#####################################################################
		print()
		print('One Class SVM') # healthy data to train only
		print()

		output.write("\n===== ONE CLASS SVM =====\n")
		output.write("===== DATA INFORMATION FOR THIS METHOD 	=====\n")
		output.write('training data size: ' +samples_size +' by '+ input_dimensions+'\n')
		output.write('test data size: '  +samples_size_test +' by '+ input_dimensions_test+'\n')
		output.write('training set is all healthy data, testing set contains other data and all failed points\n')

		clf = svm.OneClassSVM(nu=0.15, kernel='rbf', gamma=0.75) # nu=0.15
		clf.fit(training)
		with open(filepath+'svm_one_class.pickle','wb') as f:
			pickle.dump(clf,f)
		y_pred_train = clf.predict(training)
		y_pred_test = clf.predict(testing)
		anomaly_detection_error(y_pred_train, train_Y[train_Y==0], "training", output, filepath+'OneClassSVM', OneClassSVMMethod=True)
		anomaly_detection_error(y_pred_test, testing_Y, "testing", output, filepath+'OneClassSVM', OneClassSVMMethod=True)

		#####################################################################
		# ISOLATION FOREST
		#####################################################################
		print()
		print('IsolationForest')
		print()

		output.write("\n===== ISOLATION FOREST =====\n")

		# Example settings
		n_samples = 100
		samples_max = 0.7336951612320737
		contamination_fraction = 0.11294048783176784

		clf = IsolationForest(n_estimators=n_samples,
								max_samples=samples_max,
								contamination=contamination_fraction,
								random_state=0)
		clf.fit(train_X)
		with open(filepath+'IsolationForest.pickle','wb') as f:
			pickle.dump(clf,f)
		y_pred_train = clf.predict(train_X)
		y_pred_test = clf.predict(test_X)
		anomaly_detection_error(y_pred_train, train_Y, "training", output, filepath+'Isolation Forest')
		anomaly_detection_error(y_pred_test, test_Y, "testing", output, filepath+'Isolation Forest')
					
		#####################################################################
		# ELLIPTIC ENVELOPE
		#####################################################################
		print()
		print('Elliptic Envelope')
		print()

		output.write("\n===== ELLIPTIC ENVELOPE =====\n")

		clf = EllipticEnvelope(contamination=0.175, random_state=0)
		clf.fit(train_X)
		with open(filepath+'EllipticEnvelope.pickle','wb') as f:
			pickle.dump(clf,f)
		y_pred_train = clf.predict(train_X)
		y_pred_test = clf.predict(test_X)
		anomaly_detection_error(y_pred_train, train_Y, "training", output, filepath+'EE')
		anomaly_detection_error(y_pred_test, test_Y, "testing", output, filepath+'EE')
		
		#####################################################################
		# LOCAL OUTLIER FACTOR
		#####################################################################
		print()
		print('Local Outlier Factor')
		print()

		output.write("\n=====LOCAL OUTLIER FACTOR =====\n'")

		for i in [100, 150, 200, 500, 1000]:
			clf = LocalOutlierFactor(n_neighbors=i, contamination=0.25)

			y_pred_train = clf.fit_predict(train_X)
			y_pred_test = clf._predict(test_X)
			anomaly_detection_error(y_pred_train, train_Y, "training", output, filepath+'LOF')
			anomaly_detection_error(y_pred_test, test_Y, "testing", output, filepath+'LOF')
			with open('R:\\SMM-Structures\\A1-010391 (Navy IPMS data analytics)\\Technical\\Data\\datafiles\\'+'LOF {} neighbours.pickle'.format(i),'wb') as f:
				pickle.dump(clf,f)
		print()
Example #34
def get_monitoring_tools(X, y):
    """
    determine outlier and distance thresholds
    return thresholds, outlier model(s) and source distributions for distances
    NOTE: for classification the outlier detection on y is not needed

    """

    preprocessor = get_preprocessor()
    preprocessor = preprocessor.fit(X)
    X_pp = preprocessor.transform(X)

    xpipe = Pipeline(steps=[(
        'pca',
        PCA(2)), ('clf',
                  EllipticEnvelope(random_state=0, contamination=0.01))])
    xpipe.fit(X_pp)

    bs_samples = 1000
    outliers_X = np.zeros(bs_samples)
    wasserstein_X = np.zeros(bs_samples)
    wasserstein_y = np.zeros(bs_samples)

    for b in range(bs_samples):
        n_samples = int(np.round(0.80 * X.shape[0]))
        subset_indices = np.random.choice(np.arange(X.shape[0]),
                                          n_samples,
                                          replace=True).astype(int)
        y_bs = y[subset_indices]
        X_bs = X_pp[subset_indices, :]

        test1 = xpipe.predict(X_bs)
        wasserstein_X[b] = wasserstein_distance(X_pp.flatten(), X_bs.flatten())
        wasserstein_y[b] = wasserstein_distance(y, y_bs.flatten())
        outliers_X[b] = 100 * (1.0 - (test1[test1 == 1].size / test1.size))

    ## determine thresholds as a function of the confidence intervals
    outliers_X.sort()
    outlier_X_threshold = outliers_X[int(0.975 * bs_samples)] + outliers_X[int(
        0.025 * bs_samples)]

    wasserstein_X.sort()
    wasserstein_X_threshold = wasserstein_X[int(
        0.975 * bs_samples)] + wasserstein_X[int(0.025 * bs_samples)]

    wasserstein_y.sort()
    wasserstein_y_threshold = wasserstein_y[int(
        0.975 * bs_samples)] + wasserstein_y[int(0.025 * bs_samples)]

    to_return = {
        "outlier_X": np.round(outlier_X_threshold, 1),
        "wasserstein_X": np.round(wasserstein_X_threshold, 2),
        "wasserstein_y": np.round(wasserstein_y_threshold, 2),
        "preprocessor": preprocessor,
        "clf_X": xpipe,
        "X_source": X_pp,
        "y_source": y,
        "latest_X": X,
        "latest_y": y
    }
    return (to_return)
Example #35
#!/usr/bin/env python
#-*- coding:utf-8 -*-

import numpy as np
from sklearn.covariance import EllipticEnvelope
import matplotlib.pyplot as plt

X1 = np.loadtxt('slocbool.txt')
ee = EllipticEnvelope(support_fraction=1., contamination=0.02)
xx, yy = np.meshgrid(np.linspace(0, 1500000, 542), np.linspace(0, 15000, 542))
ee.fit(X1)
Z = ee.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)

plt.figure(1)
plt.title("Outlier detection: SLOC vs BOOL")
plt.scatter(X1[:, 0], X1[:, 1], color='black')
plt.contour(xx, yy, Z, levels=[0], linewidths=2, colors='m')
plt.ylabel("count of boolean expressions")
plt.xlabel("count of source lines of code")
plt.show()
Example #36
def detect(file_path, space, deleted_features):
    """
    Detect outliers
    """
    start_time = time.time()
    print("==================================================")
    print("Outlier detection and treatment started ...")
    print("Space:", space)

    X = pd.read_csv(file_path)

    if len(deleted_features) > 0:
        X = X.drop(deleted_features, axis=1, inplace=False)

    # Basic data cleaning
    X = data_cleaning_formatting(X)

    y_predicted = None
    params = space['params']
    error = dict()

    try:
        if space['model'] == "DBSCAN":
            model = DBSCAN(**params)
            y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x < 0 else 0, y_predicted))

        elif space['model'] == "OPTICS":
            model = OPTICS(**params)
            y_predicted = model.fit_predict(X)
            print(y_predicted)
            y_predicted = list(map(lambda x: 1 if x < 0 else 0, y_predicted))

        elif space['model'] == "EllipticEnvelope":
            model = EllipticEnvelope(**params)
            y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x == -1 else 0, y_predicted))

        elif space['model'] == "IsolationForest":
            model = IsolationForest(**params)
            with parallel_backend('threading'):
                y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x == -1 else 0, y_predicted))

        elif space['model'] == "OneClassSVM":
            model = OneClassSVM(**params)
            y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x == -1 else 0, y_predicted))

        elif space['model'] == "LocalOutlierFactor":
            model = LocalOutlierFactor(**params)
            with parallel_backend('threading'):
                y_predicted = model.fit_predict(X)
            y_predicted = list(map(lambda x: 1 if x == -1 else 0, y_predicted))

        elif space['model'] == "zscore":
            model = ZScore(threshold=params['threshold'])
            y_predicted = model.fit_predict(X)

    except Exception as e:
        print("Error:", e)
        y_predicted = [0] * X.shape[0]
        error['detect_' + str(space)] = e

    if isinstance(y_predicted, list):
        y_predicted = np.array(y_predicted)

    time_taken = time.time() - start_time
    print("Time taken:", time_taken)

    return y_predicted
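A hedged usage sketch for detect (the CSV path, dropped column, and parameter values are placeholders, and the helper functions and model classes it relies on are assumed to be importable in this module): space names the detector and its keyword arguments, and the result is a 0/1 array with 1 marking outlier rows.

space = {
    "model": "EllipticEnvelope",
    "params": {"contamination": 0.05, "random_state": 0},
}
flags = detect("data/example.csv", space, deleted_features=["id"])
print(int(flags.sum()), "rows flagged as outliers")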
# Elliptic Envelope outlier detection for a real dataset

import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import load_boston
from sklearn.covariance import EllipticEnvelope
from scipy import stats

# Get the data
dataset = load_boston()
data = dataset["data"][:, [8, 10]]  # Two cluster data
contamination = 0.261

# Fit the model
clf = EllipticEnvelope(contamination=contamination)
clf.fit(data)

# Perform outlier detection
predicted_data = clf.predict(data)
inlier_predicted_data = data[predicted_data == 1]
outlier_predicted_data = data[predicted_data == -1]
num_inliers_predicted = inlier_predicted_data.shape[0]
num_outliers_predicted = outlier_predicted_data.shape[0]

# Plot decision function values
xr = np.linspace(-5, 30, 500)
yr = np.linspace(5, 30, 500)
xx, yy = np.meshgrid(xr, yr)
zz = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
zz = zz.reshape(xx.shape)
scores = clf.decision_function(data)
    def online_anomaly_detection(self, data_seasnCorec, baseline_window,
                                 sliding_window, outliers_fraction):
        if (mode == 'daily'):
            for i in range(baseline_window, len(data_seasnCorec)):
                data_train_w = data_seasnCorec[i - baseline_window:i]
                # train data normalization ------------------------------------------------------
                data_train_w += 0.1
                standardizer = StandardScaler().fit(data_train_w.values)
                data_train_scaled = standardizer.transform(data_train_w.values)
                data_train_scaled_features = pd.DataFrame(
                    data_train_scaled,
                    index=data_train_w.index,
                    columns=data_train_w.columns)
                data = pd.DataFrame(data_train_scaled_features)
                data_1 = pd.DataFrame(data).fillna(0)
                data_1['steps'] = '0'
                data_1['steps_window_12'] = (data_1['steps'])
                data_train_w = data_1
                data_train.append(data_train_w)

                data_test_w = data_seasnCorec[i:i + sliding_window]
                # test data normalization ------------------------------------------------------
                data_test_w += 0.1
                data_test_scaled = standardizer.transform(data_test_w.values)
                data_scaled_features = pd.DataFrame(
                    data_test_scaled,
                    index=data_test_w.index,
                    columns=data_test_w.columns)
                data = pd.DataFrame(data_scaled_features)
                data_1 = pd.DataFrame(data).fillna(0)
                data_1['steps'] = '0'
                data_1['steps_window_12'] = (data_1['steps'])
                data_test_w = data_1
                data_test.append(data_test_w)

                # fit the model  ------------------------------------------------------
                model = EllipticEnvelope(
                    random_state=RANDOM_SEED,
                    contamination=outliers_fraction,
                    support_fraction=0.7).fit(data_train_w)
                # predict the test set
                preds = model.predict(data_test_w)
                #preds = preds.rename(lambda x: 'anomaly' if x == 0 else x, axis=1)
                dfs.append(preds)
        else:
            for i in range(baseline_window, len(data_seasnCorec)):
                if ((i - baseline_window) // 24 % 7 == 0):
                    recent_index = i
                    data_train_w = data_seasnCorec[i - baseline_window:i]
                    data_train_w += 0.1
                    standardizer = StandardScaler().fit(data_train_w.values)
                    data_train_scaled = standardizer.transform(
                        data_train_w.values)
                    data_train_scaled_features = pd.DataFrame(
                        data_train_scaled,
                        index=data_train_w.index,
                        columns=data_train_w.columns)
                    data = pd.DataFrame(data_train_scaled_features)
                    data_1 = pd.DataFrame(data).fillna(0)
                    data_1['steps'] = '0'
                    data_1['steps_window_12'] = (data_1['steps'])
                    data_train_w = data_1
                    data_train.append(data_train_w)

                else:
                    data_train_w = data_seasnCorec[
                        recent_index - baseline_window:recent_index]
                    data_train_w += 0.1
                    standardizer = StandardScaler().fit(data_train_w.values)
                    data_train_scaled = standardizer.transform(
                        data_train_w.values)
                    data_train_scaled_features = pd.DataFrame(
                        data_train_scaled,
                        index=data_train_w.index,
                        columns=data_train_w.columns)
                    data = pd.DataFrame(data_train_scaled_features)
                    data_1 = pd.DataFrame(data).fillna(0)
                    data_1['steps'] = '0'
                    data_1['steps_window_12'] = (data_1['steps'])
                    data_train_w = data_1
                    data_train.append(data_train_w)

                data_test_w = data_seasnCorec[i:i + sliding_window]
                # test data normalization ------------------------------------------------------
                data_test_w += 0.1
                data_test_scaled = standardizer.transform(data_test_w.values)
                data_scaled_features = pd.DataFrame(
                    data_test_scaled,
                    index=data_test_w.index,
                    columns=data_test_w.columns)
                data = pd.DataFrame(data_scaled_features)
                data_1 = pd.DataFrame(data).fillna(0)
                data_1['steps'] = '0'
                data_1['steps_window_12'] = (data_1['steps'])
                data_test_w = data_1
                data_test.append(data_test_w)

                model = EllipticEnvelope(
                    random_state=RANDOM_SEED,
                    contamination=outliers_fraction,
                    support_fraction=0.7).fit(data_train_w)
                # predict the test set
                preds = model.predict(data_test_w)
                #preds = preds.rename(lambda x: 'anomaly' if x == 0 else x, axis=1)
                dfs.append(preds)
Example #39
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn.datasets import load_boston

# Get data
X1 = load_boston()['data'][:, [8, 10]]  # two clusters
X2 = load_boston()['data'][:, [5, 12]]  # "banana"-shaped

# Define "classifiers" to be used
classifiers = {
    "Empirical Covariance":
    EllipticEnvelope(support_fraction=1., contamination=0.261),
    "Robust Covariance (Minimum Covariance Determinant)":
    EllipticEnvelope(contamination=0.261),
    "OCSVM":
    OneClassSVM(nu=0.261, gamma=0.05)
}
colors = ['m', 'g', 'b']
legend1 = {}
legend2 = {}

# Learn a frontier for outlier detection with several classifiers
xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500))
xx2, yy2 = np.meshgrid(np.linspace(3, 10, 500), np.linspace(-5, 45, 500))
for i, (clf_name, clf) in enumerate(classifiers.items()):
    plt.figure(1)
    clf.fit(X1)
Example #40
def get_update_annots(ls_in):

    f_idx, file, a_usr, b_sys, annots_usr, annots_sys, data_usr, data_sys, num_feat_per_person, pad_noise_bool, n_pre = ls_in

    data_pts = []
    file_length = 0
    # framelen = 0.05
    framelen = 5
    pred_len_max = 20
    # turn_batch = 4

    # get covariance matrix
    backup_cov = np.array(json.load(open('./tools/mean_cov.json', 'rb')))
    # silence bools are for when both speakers are silent
    silence_bools = np.where(annots_usr['prev_gap_silence_bools'])[0][1:]
    ipu_strts, ipu_ends = convert_to_ms_int_floor(
        annots_usr['ipu_start_times']), convert_to_ms_int_floor(
            annots_usr['ipu_end_times'])
    sil_strts, sil_ends = ipu_ends[silence_bools - 1], ipu_strts[silence_bools]

    try:
        na = np.where(sil_ends > sil_strts)[0]
        sil_strts, sil_ends = sil_strts[na], sil_ends[na]
        na = np.where(sil_strts[1:] >= sil_ends[:-1])[0]
        sil_strts[1:], sil_ends[1:] = sil_strts[na + 1], sil_ends[na + 1]
    except:
        print('error at file: ' + file)

    # assert all(sil_ends > sil_strts)
    # assert all(sil_strts[1:] > sil_ends[:-1])

    # use this to estimate the covariance matrix of the OPPOSITE person
    # we do this because there is less of a chance that the other person will be making
    # noises such as lip smacks etc.. (prob better reasons)
    ls = [
        np.arange(s, e)
        for s, e in zip(np.rint(sil_strts /
                                framelen), np.rint(sil_ends / framelen))
        if e > s
    ]
    if len(ls):
        l = np.concatenate(ls)
        silences = data_sys[l.astype(np.int)]
    else:
        print('bad file')
        print(file)
        # pad with zeros  instead:
        silences = np.zeros([3, data_sys.shape[1]])

    # old covariance estimation
    # self.sil_cov_matrices[file][b_sys] = np.cov(silences, rowvar=False)
    # self.sil_means[file][b_sys] = np.mean(silences, 0)

    # if padding test_seqs with noise, estimate the covariance with an elliptic envelope so outliers don't skew it
    sil_means = np.mean(silences, 0)

    if pad_noise_bool:
        try:
            cov = EllipticEnvelope().fit(silences - sil_means)
            cov = cov.covariance_
        except:
            cov = backup_cov
    else:
        cov = []
    # get va annotations
    max_time = int(
        np.rint(
            max([
                convert_to_ms_int_floor(annots_usr['end_time_words'][-1]),
                convert_to_ms_int_floor(annots_sys['end_time_words'][-1])
            ]) / framelen))

    def get_va_annots(annots, max_time):
        va_annots = np.zeros(max_time, dtype=np.int16)
        for wrd_strt, wrd_end in zip(
                convert_to_ms_int_floor(annots['start_time_words']),
                convert_to_ms_int_floor(annots['end_time_words'])):
            wrd_strt_f = int(np.rint(wrd_strt / framelen))
            wrd_end_f = int(np.rint(wrd_end / framelen))
            # (maybe) need to add 1 because of the floor operator
            va_annots[wrd_strt_f:wrd_end_f] = 1
        return va_annots

    va_annots_usr = get_va_annots(annots_usr, max_time)
    va_annots_sys = get_va_annots(annots_sys, max_time)

    # pad with extra values for predictions
    va_annots_usr = np.concatenate(
        [va_annots_usr,
         np.zeros(pred_len_max + 1, dtype=np.int16)])
    va_annots_sys = np.concatenate(
        [va_annots_sys,
         np.zeros(pred_len_max + 1, dtype=np.int16)])
    # hs_annots_sys = get_hs_annots(annots_sys, va_annots_usr, max_time)

    sys_update_start_frames = np.rint(
        convert_to_ms_int_floor(
            annots_usr['updates']['sys_update_strt_times'])[:-1] /
        framelen).astype(np.int32)
    sys_update_end_frames = np.rint(
        convert_to_ms_int_floor(
            annots_usr['updates']['sys_update_end_times'])[:-1] /
        framelen).astype(np.int32)

    # num_updates = len(annots_usr['updates']['sys_update_turns']) - 1 # omit last update
    usr_updates, sys_updates, sys_turns = [], [], []
    update_batch_list = []
    for update_idx, (strt_fidx_update, end_fidx_update) in enumerate(
            zip(sys_update_start_frames, sys_update_end_frames)):  # update_idx
        strt_t_update = convert_to_ms_int_floor(
            annots_usr['updates']['sys_update_strt_times'][update_idx])
        end_t_update = convert_to_ms_int_floor(
            annots_usr['updates']['sys_update_end_times'][update_idx])

        if strt_fidx_update == end_fidx_update:
            print('Update is zero length')
            pdb.set_trace()
            strt_fidx_update = strt_fidx_update - 1

        # Get associated turns for the user
        usr_turn_words_start_time_ms_int = convert_to_ms_int_floor(
            annots_usr['turn_words_start_time'])
        usr_turn_words_end_time_ms_int = convert_to_ms_int_floor(
            annots_usr['turn_words_end_time'])
        usr_update_turns = \
            (usr_turn_words_start_time_ms_int < strt_t_update) & (usr_turn_words_end_time_ms_int >= strt_t_update) | \
            (usr_turn_words_start_time_ms_int >= strt_t_update) & (usr_turn_words_start_time_ms_int < end_t_update) | \
            (usr_turn_words_start_time_ms_int < end_t_update) & (
                usr_turn_words_end_time_ms_int >= end_t_update)
        usr_update_turns = np.where(usr_update_turns)[0]
        usr_update_turn_starts_t = annots_usr['turn_words_start_time'][
            usr_update_turns]
        usr_update_turn_ends_t = annots_usr['turn_words_end_time'][
            usr_update_turns]
        sys_update_turn_start_t = annots_sys['turn_words_start_time'][
            update_idx]
        sys_update_turn_end_t = annots_sys['turn_words_end_time'][update_idx]
        sys_turn_enc_strt_f = int(
            np.rint(
                convert_to_ms_int_floor(sys_update_turn_start_t) / framelen))
        sys_turn_enc_end_f = int(
            np.rint(convert_to_ms_int_floor(sys_update_turn_end_t) / framelen))
        assert convert_to_ms_int_floor(sys_update_turn_end_t) == end_t_update
        sys_turn_full_over = annots_sys['turn_full_overlap'][update_idx]
        # If system turn is not in full overlap, get the user turn that it is associated with and the offset

        if not (sys_turn_full_over or update_idx == 0):
            # Associated user turn is the user turn that began directly before the system turn
            associated_usr_turn = usr_update_turns[np.where(
                convert_to_ms_int_floor(usr_update_turn_starts_t) <
                convert_to_ms_int_floor(sys_update_turn_start_t))[0][-1]]
            # hack. Also, catch other overlaps that aren't in sys_turn_full_over
            if annots_usr['turn_words_end_time'][
                    associated_usr_turn] > sys_update_turn_end_t:
                associated_usr_turn = -1
        else:
            associated_usr_turn = -1

        # Get associated IPUs for the user
        usr_ipu_start_time_ms_int = convert_to_ms_int_floor(
            annots_usr['ipu_start_times'])
        usr_ipu_end_time_ms_int = convert_to_ms_int_floor(
            annots_usr['ipu_end_times'])
        usr_update_ipus = \
            (usr_ipu_start_time_ms_int < strt_t_update) & (usr_ipu_end_time_ms_int >= strt_t_update) | \
            (usr_ipu_start_time_ms_int >= strt_t_update) & (usr_ipu_start_time_ms_int < end_t_update) | \
            (usr_ipu_start_time_ms_int < end_t_update) & (
                usr_ipu_end_time_ms_int >= end_t_update)
        usr_update_ipus = np.where(usr_update_ipus)[0]
        usr_update_ipus_starts_t = annots_usr['ipu_start_times'][
            usr_update_ipus]
        usr_update_ipus_ends_t = annots_usr['ipu_end_times'][usr_update_ipus]

        if update_idx == 0:
            associated_usr_ipu = -1
            associated_usr_ipu_strt_t = 0
            associated_usr_ipu_end_t = -1
            associated_usr_ipu_strt_f = 0
            associated_usr_ipu_end_f = -1
        # If system turn is not in full overlap, get the user turn that it is associated with and the offset
        elif not (associated_usr_turn == -1):
            # Associated user IPU is the user ipu that began directly before the system turn
            associated_usr_ipu = usr_update_ipus[np.where(
                convert_to_ms_int_floor(usr_update_ipus_starts_t) <
                convert_to_ms_int_floor(sys_update_turn_start_t))[0][-1]]
            associated_usr_ipu_strt_t = annots_usr['ipu_start_times'][
                associated_usr_ipu]
            associated_usr_ipu_end_t = annots_usr['ipu_end_times'][
                associated_usr_ipu]
            associated_usr_ipu_strt_f = int(
                np.rint(
                    convert_to_ms_int_floor(associated_usr_ipu_strt_t) /
                    framelen))
            associated_usr_ipu_end_f = int(
                np.rint(
                    convert_to_ms_int_floor(associated_usr_ipu_end_t) /
                    framelen))

        else:
            associated_usr_ipu = np.where(
                convert_to_ms_int_floor(annots_usr['ipu_start_times']) <
                convert_to_ms_int_floor(sys_update_turn_start_t))[0][-1]
            associated_usr_ipu_strt_t = annots_usr['ipu_start_times'][
                associated_usr_ipu]
            associated_usr_ipu_end_t = annots_usr['ipu_end_times'][
                associated_usr_ipu]
            associated_usr_ipu_strt_f = int(
                np.rint(
                    convert_to_ms_int_floor(associated_usr_ipu_strt_t) /
                    framelen))
            associated_usr_ipu_end_f = int(
                np.rint(
                    convert_to_ms_int_floor(associated_usr_ipu_end_t) /
                    framelen))
            associated_usr_ipu = -1

        # get updates
        usr_update = data_usr[strt_fidx_update:end_fidx_update]
        sys_update = data_sys[strt_fidx_update:end_fidx_update]

        # continuous voice activity annotations
        cont_pred_vec_usr = va_annots_usr[strt_fidx_update:end_fidx_update +
                                          20]
        cont_pred_vec_sys = va_annots_sys[strt_fidx_update:end_fidx_update +
                                          20]

        # Get system turn for encoder
        sys_enc_feats = data_sys[sys_turn_enc_strt_f:sys_turn_enc_end_f]

        # Get test_seq
        # Find the first switch from silence to speech by the user after the system ground truth start and pad with silence noise.
        sil_indx = 0
        while sil_indx < len(va_annots_usr[sys_turn_enc_strt_f:]) - 1:
            if va_annots_usr[sys_turn_enc_strt_f:][
                    sil_indx] == 0 and va_annots_usr[sys_turn_enc_strt_f:][
                        sil_indx + 1] == 1:
                break
            else:
                sil_indx += 1

        # sil_indx is one frame before the last of the silence frames
        sil_indx = sys_turn_enc_strt_f + sil_indx
        if (sil_indx - strt_fidx_update) == 0:
            sil_indx += 1

        test_seq = data_usr[strt_fidx_update:sil_indx]

        try:
            assert test_seq.shape[0] > 0
        except AssertionError:
            print('test seq shape is zero in file: ' + file)

        # Get train Y
        y_UT = np.zeros(len(usr_update), dtype=np.int16)
        # protect against turns that start on first frame of file
        y_train_strt = max([0, sys_turn_enc_strt_f - 1 - strt_fidx_update])
        y_UT[y_train_strt:sys_turn_enc_end_f - strt_fidx_update - 1] = 1

        if not any(y_UT == 1):
            print('bad')
        y_strt_t = sys_update_turn_start_t - \
            annots_usr['updates']['sys_update_strt_times'][update_idx]
        y_end_t = sys_update_turn_end_t - \
            annots_usr['updates']['sys_update_strt_times'][update_idx]
        y_strt_f = sys_turn_enc_strt_f - strt_fidx_update
        y_end_f = sys_turn_enc_end_f - strt_fidx_update
        associated_usr_ipu_strt_f = associated_usr_ipu_strt_f - strt_fidx_update
        associated_usr_ipu_end_f = associated_usr_ipu_end_f - strt_fidx_update

        # Get words
        # Candidate system turn encoding words and update words
        s_i = annots_sys['turn_words_start_indx'][update_idx]
        e_i = annots_sys['turn_words_end_indx'][update_idx] + 1
        sys_enc_words = annots_sys['target_words'][s_i:e_i]
        sys_enc_word_strt_ts = annots_sys['start_time_words'][s_i:e_i]
        sys_enc_word_end_ts = annots_sys['end_time_words'][s_i:e_i]
        sys_update_word_strt_frames = np.rint(
            convert_to_ms_int_floor(sys_enc_word_strt_ts) /
            framelen) - strt_fidx_update
        sys_update_word_end_frames = np.rint(
            convert_to_ms_int_floor(sys_enc_word_end_ts) /
            framelen) - strt_fidx_update
        sys_enc_word_strt_frames = np.rint(
            convert_to_ms_int_floor(sys_enc_word_strt_ts) /
            framelen) - sys_turn_enc_strt_f
        sys_enc_word_end_frames = np.rint(
            convert_to_ms_int_floor(sys_enc_word_end_ts) /
            framelen) - sys_turn_enc_strt_f

        # User update words
        if not len(usr_update_turns):
            s_i, e_i = 0, 0
        else:
            s_i = annots_usr['turn_words_start_indx'][usr_update_turns][0]
            e_i = annots_usr['turn_words_end_indx'][usr_update_turns][-1] + 1
        usr_update_words = annots_usr['target_words'][s_i:e_i]
        usr_update_word_strt_ts = annots_usr['start_time_words'][s_i:e_i]
        usr_update_word_end_ts = annots_usr['end_time_words'][s_i:e_i]
        usr_update_word_strt_frames = np.rint(
            convert_to_ms_int_floor(usr_update_word_strt_ts) /
            framelen) - strt_fidx_update
        usr_update_word_end_frames = np.rint(
            convert_to_ms_int_floor(usr_update_word_end_ts) /
            framelen) - strt_fidx_update

        # test seq words
        usr_end_fs = np.rint(
            convert_to_ms_int_floor(annots_usr['end_time_words']) / framelen)
        test_wrd_indices = np.where((usr_end_fs >= strt_fidx_update)
                                    & (usr_end_fs < sil_indx))[0]
        if not len(test_wrd_indices):
            s_i, e_i = 0, 0
        else:
            s_i, e_i = test_wrd_indices[0], test_wrd_indices[-1] + 1
        test_words = annots_usr['target_words'][s_i:e_i]
        test_word_strt_ts = annots_usr['start_time_words'][s_i:e_i]
        test_word_end_ts = annots_usr['end_time_words'][s_i:e_i]
        test_word_strt_frames = np.rint(
            convert_to_ms_int_floor(test_word_strt_ts) /
            framelen) - strt_fidx_update
        test_word_end_frames = np.rint(
            convert_to_ms_int_floor(test_word_end_ts) /
            framelen) - strt_fidx_update

        # dialogue acts for sys encoding
        turn_ipu_start_indx = annots_sys['turn_ipu_start_indx'][update_idx]
        turn_ipu_end_indx = annots_sys['turn_ipu_start_indx'][update_idx + 1]
        # sys_enc_das = annots_sys['da_ISO_second_pass_vec'][turn_ipu_start_indx:turn_ipu_end_indx]
        sys_enc_da_strt_ts = annots_sys['ipu_start_times'][
            turn_ipu_start_indx:turn_ipu_end_indx]
        sys_enc_da_end_ts = annots_sys['ipu_end_times'][
            turn_ipu_start_indx:turn_ipu_end_indx]
        sys_enc_da_strt_frames = np.rint(
            convert_to_ms_int_floor(sys_enc_da_strt_ts) /
            framelen) - sys_turn_enc_strt_f
        sys_enc_da_end_frames = np.rint(
            convert_to_ms_int_floor(sys_enc_da_end_ts) /
            framelen) - sys_turn_enc_strt_f

        word_da_dict = {
            'strt_t_update':
            strt_t_update,
            'end_t_update':
            end_t_update,
            'strt_fidx_update':
            strt_fidx_update,
            'end_fidx_update':
            end_fidx_update,
            'sys_enc_words':
            sys_enc_words,
            'sys_enc_word_strt_ts':
            sys_enc_word_strt_ts,
            'sys_enc_word_end_ts':
            sys_enc_word_end_ts,
            'sys_update_words':
            sys_enc_words,
            'sys_update_word_strt_frames':
            sys_update_word_strt_frames.astype(np.int16),
            'sys_update_word_end_frames':
            sys_update_word_end_frames.astype(np.int16),
            'sys_enc_word_strt_frames':
            sys_enc_word_strt_frames.astype(np.int16),
            'sys_enc_word_end_frames':
            sys_enc_word_end_frames.astype(np.int16),
            'usr_update_words':
            usr_update_words,
            'usr_update_word_strt_ts':
            usr_update_word_strt_ts,
            'usr_update_word_end_ts':
            usr_update_word_end_ts,
            'usr_update_word_strt_frames':
            usr_update_word_strt_frames.astype(np.int16),
            'usr_update_word_end_frames':
            usr_update_word_end_frames.astype(np.int16),
            'test_words':
            test_words,
            'test_word_strt_ts':
            test_word_strt_ts,
            'test_word_end_ts':
            test_word_end_ts,
            'test_word_strt_frames':
            test_word_strt_frames.astype(np.int16),
            'test_word_end_frames':
            test_word_end_frames.astype(np.int16),
            # 'sys_enc_das': sys_enc_das,
            'sys_enc_da_strt_ts':
            sys_enc_da_strt_ts,
            'sys_enc_da_end_ts':
            sys_enc_da_end_ts,
            'sys_enc_da_strt_frames':
            sys_enc_da_strt_frames,
            'sys_enc_da_end_frames':
            sys_enc_da_end_frames
        }

        data_pts.append({
            'y_strt_f': [y_strt_f],
            'y_strt_t': [y_strt_t],
            'y_end_f': [y_end_f],
            'y_end_t': [y_end_t],
            'y_length': [len(sys_enc_feats)],
            'associated_usr_ipu_strt_f': [associated_usr_ipu_strt_f],
            'associated_usr_ipu_end_f': [associated_usr_ipu_end_f],
            'usr_update':
            usr_update,
            'sys_update':
            sys_update,
            'sys_trn': [sys_enc_feats],
            'test_seq':
            test_seq,
            'file': [file],
            'a_usr': [a_usr],
            'update_strt_t':
            [annots_usr['updates']['sys_update_strt_times'][update_idx]],
            'update_end_t':
            [annots_usr['updates']['sys_update_end_times'][update_idx]],
            'update_strt_f': [strt_fidx_update],
            'update_end_f': [end_fidx_update],
            'associated_usr_turn':
            associated_usr_turn,
            'update_idx':
            update_idx,
            'y_UT':
            y_UT,
            'va_usr':
            cont_pred_vec_usr,
            'va_sys':
            cont_pred_vec_sys,
            'word_da_dict':
            word_da_dict
        })
        file_length += 1
    return [data_pts, sil_means, cov, file, a_usr, b_sys, file_length]
Example #41
0
answerIF_proba = pd.DataFrame({'target': answerIF_proba})
pickle.dump(ilf, open("../../data/model/IsolationForest", "wb"))

## Local Outlier Factor

lof = LocalOutlierFactor(n_neighbors=2, novelty=True)
lof.fit(data_scaled_means)
answerLOF_proba = lof.decision_function(data_scaled_means)
answerLOF_proba = 1 - ((answerLOF_proba - answerLOF_proba.min()) /
                       (answerLOF_proba.max() - answerLOF_proba.min()))
answerLOF_proba = pd.DataFrame({'target': answerLOF_proba})
pickle.dump(lof, open("../../data/model/LocalOutlierFactor", "wb"))

## Elliptic Envelope

ee = EllipticEnvelope()
ee.fit(data_scaled_means)
answerEE_proba = ee.decision_function(data_scaled_means)
answerEE_proba = 1 - (answerEE_proba - 3 * answerEE_proba.min()) * 10**12
answerEE_proba = pd.DataFrame({'target': answerEE_proba})
pickle.dump(ee, open("../../data/model/EllipticEnvelope", "wb"))

##############

### Soft voting

voting_answer = pd.DataFrame({
    'target':
    ((answerIF_proba * 2 + answerLOF_proba * 1 + answerEE_proba * 2) /
     5).T.apply(lambda x: -1 if x.values[0] > 0.4 else 1)
})
## 5: Misaligned Gaussian Mixture
blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=3)
X_misaligned = make_blobs(centers=[[-0.7, -0.7, -0.7], [0.7, 0.7, -0.7],
                                   [-0.7, 0.7, 0.7]],
                          cluster_std=[0.2, 0.2, 0.2],
                          **blobs_params)[0]
## 6: Whole dataset
datasets3D = [X_lin, X_hex, X_sph, X_gau, X_misaligned]

# Define the data labels
labels = np.concatenate([np.ones(n_inliers), -np.ones(n_outliers)], axis=0)
# label 1 as inliers, -1 as outliers

# define outlier/anomaly detection methods to be compared
anomaly_algorithms = [("Robust covariance",
                       EllipticEnvelope(contamination=outliers_fraction)),
                      ("One-Class SVM",
                       svm.OneClassSVM(nu=outliers_fraction,
                                       kernel="rbf",
                                       gamma=0.1)),
                      ("Isolation Forest (IF)",
                       IsolationForest(n_estimators=500,
                                       behaviour='new',
                                       contamination=outliers_fraction,
                                       random_state=42)),
                      ("Local Outlier Factor",
                       LocalOutlierFactor(n_neighbors=35,
                                          contamination=outliers_fraction))]

plt.figure(figsize=(14, 15))
plt.subplots_adjust(left=.02,
def floatrange(start, stop, step):
    steps = math.floor((stop - start) / step)
    temp = []
    for i in range(steps):
        temp.append(start + step * i)
    return temp


'''
Outlier detection
'''
plt.figure('Outlier Test', figsize=(9.6, 3.8), dpi=200)
# plt.suptitle('Elliptic Envelope contamination', fontsize=20)
for num, i in enumerate(list([0.01, 0.03, 0.05, 0.1, 0.2, 0.5])):
    cov = EllipticEnvelope(random_state=1, contamination=i)
    cov.fit(np.hstack([X_array, Y_array.reshape(-1, 1)]))
    index = cov.predict(np.hstack([X_array, Y_array.reshape(-1, 1)]))

    X_valid, X_invalid = error_wipe(X_array, index)
    Y_valid, Y_invalid = error_wipe(Y_array, index)

    reg1 = LinearRegression()
    reg1.fit(X_valid, Y_valid)
    reg2 = LinearRegression()
    reg2.fit(X_array, Y_array)
    reg1.rmse = RMSE(Y_valid,reg1.predict(X_valid))
    reg2.rmse = RMSE(Y_array,reg2.predict(X_array))
    print('reg1', reg1.score(X_valid, Y_valid))
    print('reg2', reg2.score(X_array, Y_array))
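The helper error_wipe is not defined in this snippet; a hypothetical implementation consistent with how it is called (splitting an array into kept and discarded rows using the +1/-1 labels from predict) might look like this:

import numpy as np

def error_wipe(arr, index):
    """Hypothetical helper: split arr into (valid, invalid) rows by +1/-1 labels."""
    arr = np.asarray(arr).reshape(len(index), -1)
    return arr[index == 1], arr[index == -1]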
Example #44
0
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.datasets import make_blobs

# Create simulated data
features, _ = make_blobs(n_samples = 10,
                         n_features = 2,
                         centers = 1,
                         random_state = 1)

# Replace the first observation's values with extreme values
features[0,0] = 10000
features[0,1] = 10000

# Create the detector
outlier_detector = EllipticEnvelope(contamination=.1)

# Fit the detector
outlier_detector.fit(features)

# Predict outliers
outlier_detector.predict(features)


# In[6]:


# Create a single feature
feature = features[:,0]

# Create a function that returns the indices of outliers
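The example cuts off here; one plausible completion, sketched as an assumption rather than the author's code, is an IQR-based function that returns the indices of outlying values in the single feature created above:

def indices_of_outliers(x):
    # hypothetical completion: flag values beyond 1.5 * IQR of the quartiles
    q1, q3 = np.percentile(x, [25, 75])
    iqr = q3 - q1
    lower_bound = q1 - 1.5 * iqr
    upper_bound = q3 + 1.5 * iqr
    return np.where((x > upper_bound) | (x < lower_bound))

indices_of_outliers(feature)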
Example #45
0
# SVM
SVM = svm.OneClassSVM(gamma='auto')

detected_results_SVM = SVM.fit_predict(featureData)

outliers_SVM = []
for i in range(len(detected_results_SVM)):
    if detected_results_SVM[i] < 0:
        outliers_SVM.append(i)

SVM_data = np.delete(featureData, (outliers_SVM), axis=0)
output_SVM = np.delete(outputLabels, (outliers_SVM), axis=0)

# Elliptic envelope
EE = EllipticEnvelope()

detected_results_EE = EE.fit_predict(featureData)

outliers_EE = []
for i in range(len(detected_results_EE)):
    if detected_results_EE[i] < 0:
        outliers_EE.append(i)

EE_data = np.delete(featureData, (outliers_EE), axis=0)
output_EE = np.delete(outputLabels, (outliers_EE), axis=0)

# Compare the outlier results
accuracy_scores = {}

model0 = svm.SVC()
Example #46
0
    return dic[x]


train['type_num'] = train['type'].apply(lambda x: to_number(x, column_number))

# Prepare the dataset to feed to the model
train_x = train.drop(columns=['type', 'type_num'], axis=1)
train_y = train['type_num']
test_x = test

# Correlation: -0.84 (petroMag_u, psfMag_u), (petroMag_u, fiberMag_u)

# Create the outlier detector
from sklearn.covariance import EllipticEnvelope

outlier_detector = EllipticEnvelope(contamination=.1)

# Train the detector
outlier_detector.fit(train_x)

# Predict outliers
pred = outlier_detector.predict(train_x)
print(pred)
print(pred.shape)

import numpy as np


def find_idx(x):
    return np.where((x < 0))
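A hedged usage sketch (not part of the original snippet): apply find_idx to the predictions and drop the flagged rows before training.

outlier_idx = find_idx(pred)[0]            # positional indices of the -1 predictions
train_x_clean = train_x.drop(train_x.index[outlier_idx])
train_y_clean = train_y.drop(train_y.index[outlier_idx])
print(train_x_clean.shape, train_y_clean.shape)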
from sklearn.svm import OneClassSVM as detector
import numpy as np
from utility import Utility
from sklearn.covariance import EllipticEnvelope

#det = detector()
det = EllipticEnvelope()

# X = np.array([1,2,3,4,5,6,7,8,9,10,1000]).reshape(11,1)
#
# det.fit(X)
#
# Y = det.predict(100)
#
# print(Y)

# Find outliers in the interaction rate data

# Step 1 - Convert the dataset into pandas series
util = Utility.SeriesUtility()
datasetFileName = "fans_change_taylor_swift.csv"
series = util.convertDatasetsToSeries(datasetFileName)

series = util.resampleSeriesSum(series, "D")

numberOfPoints = series.data.shape[0]
X = series.values.flatten().reshape(numberOfPoints,1)

det.fit(X)

predicted = det.predict(X)
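A short follow-up sketch, not in the original script, showing how the +1/-1 predictions could separate inliers from outliers in the resampled series:

inlier_mask = predicted == 1
print('outliers found:', int((~inlier_mask).sum()))
clean_values = X[inlier_mask]               # keep only the inlier data points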
Example #48
0
def ellepticEnvelopeAnomaly(df, outliersFraction):

    # create 4 different data sets based on the categories defined above
    df_class0 = df.loc[df['categories'] == 0, 'value']
    df_class1 = df.loc[df['categories'] == 1, 'value']
    df_class2 = df.loc[df['categories'] == 2, 'value']
    df_class3 = df.loc[df['categories'] == 3, 'value']

    # apply EllipticEnvelope (Gaussian distribution) to each category
    envelope = EllipticEnvelope(contamination=outliersFraction)
    X_train = df_class0.values.reshape(-1, 1)
    envelope.fit(X_train)
    df_class0 = pd.DataFrame(df_class0)
    df_class0['deviation'] = envelope.decision_function(X_train)
    df_class0['anomaly'] = envelope.predict(X_train)

    envelope = EllipticEnvelope(contamination=outliersFraction)
    X_train = df_class1.values.reshape(-1, 1)
    envelope.fit(X_train)
    df_class1 = pd.DataFrame(df_class1)
    df_class1['deviation'] = envelope.decision_function(X_train)
    df_class1['anomaly'] = envelope.predict(X_train)

    envelope = EllipticEnvelope(contamination=outliersFraction)
    X_train = df_class2.values.reshape(-1, 1)
    envelope.fit(X_train)
    df_class2 = pd.DataFrame(df_class2)
    df_class2['deviation'] = envelope.decision_function(X_train)
    df_class2['anomaly'] = envelope.predict(X_train)

    envelope = EllipticEnvelope(contamination=outliersFraction)
    X_train = df_class3.values.reshape(-1, 1)
    envelope.fit(X_train)
    df_class3 = pd.DataFrame(df_class3)
    df_class3['deviation'] = envelope.decision_function(X_train)
    df_class3['anomaly'] = envelope.predict(X_train)

    # add the per-category results back to the main dataframe
    df_class = pd.concat([df_class0, df_class1, df_class2, df_class3])
    df['anomaly22'] = df_class['anomaly']
    df['anomaly22'] = np.array(df['anomaly22'] == -1).astype(int)
    # visualisation of anomalies over time (viz 1)
    fig, ax = plt.subplots()
    a = df.loc[df['anomaly22'] == 1, ['time_epoch', 'value']]  #anomaly
    ax.plot(df['time_epoch'], df['value'], color='blue')
    ax.scatter(a['time_epoch'], a['value'], color='red')
    ax.set_title('Elliptic Envelope Multi Clustering')
    plt.show()
    return df
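Hypothetical usage of the function above, assuming df already holds the 'value', 'categories', and 'time_epoch' columns it expects:

df_labeled = ellepticEnvelopeAnomaly(df, outliersFraction=0.05)
print(df_labeled['anomaly22'].value_counts())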
Example #49
0
def plot_raw_overview(filename):
    event_type = 'all'

    if filename.name.startswith('sub-drouwen'):
        CHANS = [f'IH0{x + 1}' for x in range(8)]
    elif filename.name.startswith('sub-itens'):
        CHANS = [f'C0{x + 1}' for x in range(8)]
    elif filename.name.startswith('sub-lemmer'):
        CHANS = [f'IH{x + 1}' for x in range(8)]
    elif filename.name.startswith('sub-som705'):
        CHANS = [f'GA0{x + 1}' for x in range(8)]  # a bit random
    elif filename.name.startswith('sub-ommen'):
        CHANS = ['chan1',
                 'chan2']  # I don't understand why I cannot use 'chan64'
    elif filename.name.startswith('sub-vledder') or filename.name.startswith(
            'sub-ommen'):
        CHANS = ['chan1', 'chan64']
    elif '_acq-blackrock_' in filename.name:
        CHANS = ['chan1', 'chan128']
    else:
        print('you need to specify reference channel for this test')
        return None, None

    d = Dataset(filename, bids=True)
    event_names, event_onsets = select_events(d, event_type)

    is_ecog = d.dataset.task.channels.tsv['type'] == 'ECOG'
    is_seeg = d.dataset.task.channels.tsv['type'] == 'SEEG'
    chans = array(d.header['chan_name'])[is_ecog | is_seeg]
    data = d.read_data(begtime=event_onsets[0],
                       endtime=event_onsets[-1],
                       chan=list(chans))
    data.data[0][isnan(data.data[0])] = 0  # ignore nan

    data = montage(data, ref_chan=CHANS)
    freq = frequency(data, taper='hann', duration=2, overlap=0.5)

    hist = make_histogram(data, max=250, step=10)
    divs = []
    fig = plot_hist(hist)
    divs.append(to_div(fig))

    bad_chans = None

    if AUTOMATIC:
        from sklearn.covariance import EllipticEnvelope

        algorithm = EllipticEnvelope(
            contamination=P['data_quality']['histogram']['contamination'])
        prediction = algorithm.fit(hist.data[0]).predict(hist.data[0])
        new_bad_chans = data.chan[0][prediction == -1]
        print('bad channels with histogram / elliptic envelope: ' +
              ', '.join(new_bad_chans))
        bad_chans = set(new_bad_chans)

        fig = plot_outliers(hist.chan[0],
                            algorithm.dist_,
                            prediction,
                            yaxis_title='distance',
                            yaxis_type='log')
        divs.append(to_div(fig))

    fig = plot_freq(freq)
    divs.append(to_div(fig))

    if AUTOMATIC:
        from sklearn.neighbors import LocalOutlierFactor

        algorithm = LocalOutlierFactor(
            n_neighbors=P['data_quality']['spectrum']['n_neighbors'])
        prediction = algorithm.fit_predict(freq.data[0])

        new_bad_chans = data.chan[0][prediction == -1]
        print('bad channels with spectrum / local outlier factor: ' +
              ', '.join(new_bad_chans))
        bad_chans |= set(new_bad_chans)
        fig = plot_outliers(freq.chan[0],
                            algorithm.negative_outlier_factor_,
                            prediction,
                            yaxis_title='distance',
                            yaxis_type='linear')
        divs.append(to_div(fig))

        # use the reference channels again; they were handpicked but might have a weird spectrum
        bad_chans -= set(CHANS)

    return bad_chans, divs
Example #50
0
    def Predict(self):

        if self.ID < 0:
            self.ErrorMessage.setIcon(QMessageBox.Information)
            self.ErrorMessage.setText("Your are not logged in")
            self.ErrorMessage.setWindowTitle("Warning!")
            self.ErrorMessage.exec_()
        elif self.String == self.Accounts[self.ID].AccountPassword:
            y = []
            for i in range(len(self.Accounts)):
                if self.Accounts[self.ID].AccountPassword == self.Accounts[
                        i].AccountPassword:
                    for x in range(len(self.Accounts[i].TrainData)):
                        y.append(self.Accounts[i].AccountName)

            sts = len(list(set(y)))

            self.ProcessData()

            Xset = []
            Yset = []
            sz = len(self.Accounts[self.ID].AccountPassword) * 2

            for j in range(len(self.Accounts[self.ID].TrainData)):
                Xset.append(array(self.Accounts[self.ID].TrainData)[j][sz:])
                Yset.append(1)

            Xset = array(Xset)
            Yset = array(Yset)

            trainx, testx, trainy, testy = train_test_split(Xset,
                                                            Yset,
                                                            test_size=0.3,
                                                            random_state=2)

            trainx = array(trainx)

            X = []
            multiy = []
            multi2y = []

            if sts > 1:

                for i in range(len(self.Accounts)):
                    if self.Accounts[self.ID].AccountPassword == self.Accounts[
                            i].AccountPassword and self.ID != i:
                        hold = []
                        for k in range(len(self.Accounts[i].TrainData)):
                            hold.append(self.Accounts[i].TrainData[k][16:])
                        X = X + hold
                        for x in range(len(self.Accounts[i].TrainData)):
                            multiy.append(-1)
                            multi2y.append(0)
                X = array(X)
                multiy = array(multiy)
                multi2y = array(multi2y)

                testx = np.concatenate((testx, X))
                testymone = np.concatenate((testy, multiy))
                testymzero = np.concatenate((testy, multi2y))

            if sts == 1:
                testymone = testy
                testymzero = testy

            Osvm = OneClassSVM(kernel='rbf', gamma="auto").fit(trainx)
            Ypredict = Osvm.predict(testx)
            score = f1_score(testymone, Ypredict, pos_label=1)

            kmeans = KMeans(n_clusters=2, random_state=0).fit(trainx)
            Ypredict = kmeans.predict(testx)
            score1 = f1_score(testymzero, Ypredict, pos_label=1)

            brc = Birch(n_clusters=2, threshold=0.01).fit(trainx)
            Ypredict = brc.predict(testx)
            score2 = f1_score(testymzero, Ypredict, pos_label=1)

            IsF = IsolationForest(contamination=0.01)
            IsF.fit(trainx)
            Ypredict = IsF.predict(testx)
            score3 = f1_score(testymone, Ypredict, pos_label=1)

            ev = EllipticEnvelope(contamination=0.01)
            ev.fit(trainx)
            Ypredict = ev.predict(testx)
            score4 = f1_score(testymone, Ypredict, pos_label=1)

            if Osvm.predict([self.Dwell + self.Flight]) == 1:
                OsvmResult = 'pass'
            else:
                OsvmResult = 'fail'

            if kmeans.predict([self.Dwell + self.Flight]) == 1:
                kmResult = 'pass'
            else:
                kmResult = 'fail'

            if brc.predict([self.Dwell + self.Flight]) == 1:
                brcResult = 'pass'
            else:
                brcResult = 'fail'

            if IsF.predict([self.Dwell + self.Flight]) == 1:
                IsFResult = 'pass'
            else:
                IsFResult = 'fail'

            if ev.predict([self.Dwell + self.Flight]) == 1:
                evResult = 'pass'
            else:
                evResult = 'fail'

            #print(score,score1,score2,score3,score4)

            self.TrainText.setText("Score/Model" + " \n" +
                                   str(round(score, 2)) + " Osvm: " +
                                   OsvmResult + " \n" + str(round(score1, 2)) +
                                   " Km: " + kmResult + " \n" +
                                   str(round(score2, 2)) + " Brc: " +
                                   brcResult + " \n " + str(round(score3, 2)) +
                                   " ISF: " + IsFResult + " \n" +
                                   str(round(score4, 2)) + " Ev: " + evResult)

            #if sts > 1:
            #    self.CompareText.setText(self.Accounts[self.ID].AccountPassword)
            #    self.Compare()
            #    prediction = self.clf.predict([self.Dwell+self.Flight])
            #    str1 = str(prediction)
            #    self.TrainText.setText(str(prediction))

            self.Reset()

        else:
            self.ErrorMessage.setIcon(QMessageBox.Information)
            self.ErrorMessage.setText("Your password is wrong")
            self.ErrorMessage.setWindowTitle("Warning!")
            self.ErrorMessage.exec_()
                'IsMale',
                'Race-Black',
                'Age',
                'HAART-Naive',
                'HAART-Non-Adherent',
                'HAART-Off',
                'HAART-On',
                'Hepatitis C status (HCV)']
for col in tranfer_cols:
    _, cyto_data[col] = cyto_data.align(pat_data[col], join='left', axis = 0)
cyto_data['HCV'] = cyto_data['Hepatitis C status (HCV)']

# <codecell>

for col in cytos:
    env = EllipticEnvelope(contamination=0.05)
    env.fit(cyto_data[col].dropna().values.reshape(-1, 1))
    mask = env.predict(cyto_data[col].values.reshape(-1,1))
    cyto_data.loc[mask == -1, col] = np.nan  # use .loc to avoid chained assignment

# <codecell>


fig, axs = plt.subplots(11,3, figsize = (10,20))

for ax, col in zip(axs.flatten(), cytos):
    
    boxes = []
    mus = []
    stds = []
    for trop in trops:
Example #52
0
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor

print(__doc__)

matplotlib.rcParams['contour.negative_linestyle'] = 'solid'

# Example settings
n_samples = 300
outliers_fraction = 0.15
n_outliers = int(outliers_fraction * n_samples)
n_inliers = n_samples - n_outliers

# define outlier/anomaly detection methods to be compared
anomaly_algorithms = [
    ("Robust covariance", EllipticEnvelope(contamination=outliers_fraction)),
    ("One-Class SVM", svm.OneClassSVM(nu=outliers_fraction, kernel="rbf",
                                      gamma=0.1)),
    ("Isolation Forest", IsolationForest(behaviour='new',
                                         contamination=outliers_fraction,
                                         random_state=42)),
    ("Local Outlier Factor", LocalOutlierFactor(
        n_neighbors=35, contamination=outliers_fraction))]

# Define datasets
blobs_params = dict(random_state=0, n_samples=n_inliers, n_features=2)
datasets = [
    make_blobs(centers=[[0, 0], [0, 0]], cluster_std=0.5,
               **blobs_params)[0],
    make_blobs(centers=[[2, 2], [-2, -2]], cluster_std=[0.5, 0.5],
               **blobs_params)[0],
Example #53
0
def detect(model, clean_dataset, random_trials=100):
    logger = logging.getLogger(__name__)

    dummy_input_image = np.zeros((1, *clean_dataset.input_shape))

    KLASSES = list(range(clean_dataset.num_classes))

    logger.info('Computing saliency...')
    sms_ = saliency_map_all(model, dummy_input_image)

    sms_model = [np.linalg.norm(s, ord=2, axis=2, keepdims=True) for s in sms_]

    logger.info('Finding outliers...')

    outs = []

    for sms in sms_model:
        d = sms.reshape(-1, 1)
        env = EllipticEnvelope()
        env.fit(d)
        outliers = env.predict(d).reshape(clean_dataset.input_shape[0],
                                          clean_dataset.input_shape[1], 1)
        outliers[outliers == 1] = 0
        outliers[outliers == -1] = 1
        outs.append(outliers)

    AT_LEAST = ceil(clean_dataset.num_classes/2 + 1)
    recovered = np.stack([s == 1 for s in outs]).sum(axis=0) >= AT_LEAST

    logger.info('Recovering mask...')
    mask = np.repeat(recovered, clean_dataset.input_shape[2], axis=2)

    mask_size = mask.sum()

    mask_prop = (mask_size/(clean_dataset.input_shape[0] *
                            clean_dataset.input_shape[1]))

    logger.info('Mask proportion is %.3f', mask_prop)

    def sample_with_klass(val):
        klass = clean_dataset.x_test[clean_dataset.y_test_cat == val]
        while True:
            idx = np.random.choice(len(klass), size=1)[0]
            sample = klass[idx]
            pred = model.predict_classes(sample[np.newaxis, :])[0]

            if val == pred:
                return sample
            else:
                logger.info('Got misclassified sample, retrying...')

    logger.info('Sampling one observation per class in the clean dataset...')

    sample = np.stack([sample_with_klass(val) for val in KLASSES])
    maker = patch.pattern_maker(mask_size, dynamic=True)
    sample_preds = model.predict_classes(sample)

    logger.info('Predictions are: %s', sample_preds)

    def apply_mask(sample):
        _sample = np.copy(sample)
        _sample[:, mask] = maker()
        return _sample

    perturbed = np.stack([apply_mask(sample) for _ in range(random_trials)])

    def trial(i):
        batch = perturbed[:, i, :]
        batch_preds = model.predict_classes(batch)
        return batch_preds

    res = [trial(i) for i in range(10)]

    return sms_model, outs, recovered, sample, res, mask_prop
Example #54
0
                            covariance_type='full',
                            random_state=random),
    'B-GMM-tied':
    BayesianGaussianMixture(n_components=5,
                            covariance_type='tied',
                            random_state=random),
    'B-GMM-diag':
    BayesianGaussianMixture(n_components=10,
                            covariance_type='diag',
                            random_state=random),
    'B-GMM-spherical':
    BayesianGaussianMixture(n_components=10,
                            covariance_type='spherical',
                            random_state=random),
    'EllipticEnvelope':
    EllipticEnvelope(),
}
DATASETS = {
    #'binary': datasets.make_classification(n_classes=2, n_features=7, n_samples=100, random_state=random),
    #'5way': datasets.make_classification(n_classes=2, n_features=4, n_informative=2, n_samples=6, random_state=random),
    '5way':
    datasets.make_classification(n_classes=5,
                                 n_features=7,
                                 n_informative=5,
                                 n_samples=50,
                                 random_state=random),
}
METHODS = [
    'inline',
    #'pymodule',
    #'loadable',
# -*- coding: utf-8 -*-
import numpy as np
from sklearn.covariance import EllipticEnvelope
from sklearn.svm import OneClassSVM
import matplotlib.pyplot as plt
import matplotlib.font_manager
from sklearn.datasets import load_boston

# Get data
X1 = load_boston()['data'][:, [8, 10]]  # two clusters
X2 = load_boston()['data'][:, [5, 12]]  # "banana"-shaped

# Define "classifiers" to be used
classifiers = {
    "Empirical Covariance": EllipticEnvelope(support_fraction=1.,
                                             contamination=0.261),
    "Robust Covariance (Minimum Covariance Determinant)":
    EllipticEnvelope(contamination=0.261),
    "OCSVM": OneClassSVM(nu=0.261, gamma=0.05)}
colors = ['m', 'g', 'b']
legend1 = {}
legend2 = {}

# Learn a frontier for outlier detection with several classifiers
xx1, yy1 = np.meshgrid(np.linspace(-8, 28, 500), np.linspace(3, 40, 500))
xx2, yy2 = np.meshgrid(np.linspace(3, 10, 500), np.linspace(-5, 45, 500))
for i, (clf_name, clf) in enumerate(classifiers.items()):
    fig1a=plt.figure(1)    
    fig1a.set_size_inches(10, 10)
    clf.fit(X1)
    Z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
def outliers_from_ellipticEnvelope():
    from sklearn.covariance import EllipticEnvelope
    env=EllipticEnvelope()
    env.fit(features_pca)
    outlier_pred=env.decision_function(features_pca).ravel()
    return outlier_pred
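The returned scores can be turned into a keep/drop mask the same way several other snippets in this collection do, by thresholding at a chosen percentile; features_pca and the 5% cutoff below are assumptions, not part of the original function.

from scipy import stats

scores = outliers_from_ellipticEnvelope()
threshold = stats.scoreatpercentile(scores, 5)   # drop the lowest-scoring 5%
inliers = features_pca[scores > threshold]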
Example #57
0
from sklearn.cluster import KMeans
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import csv
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from scipy import stats

data=[]
with open('newdata.csv', 'rb') as f:
	rdr=csv.reader(f)
	for row in rdr:
		data.append([int(row[1]), int(row[2])])
data=np.array(data)
# print(data)
outliers_fraction = 0.05
# est=svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,kernel="rbf", gamma=0.1)
est=EllipticEnvelope(contamination=.1)
# est=KMeans(n_clusters=3)
est.fit(data)
# labels=est.labels_
y_pred=est.decision_function(data).ravel()
threshold = stats.scoreatpercentile(y_pred,
                                            100 * outliers_fraction)

labels = [2 if y > threshold else 1 for y in y_pred]
# labels=est.labels_
print(labels)
plt.scatter(data[:,0], data[:,1], c=labels, lw=0)
plt.show()
Example #58
0
print("Size of perturbation: ", len(perturbation_X_test))
print("Size of mixed: ", len(mixed_X_test))

# ============ ALGORITHMS ============
# model1 = LinearRegression()
model2 = IsolationForest(
    n_estimators=200, max_samples=200, contamination=0.1, random_state=100)
# model2 = IsolationForest()
model3 = OneClassSVM(kernel='linear', gamma='auto', nu=0.1)  # fix
# model3 = OneClassSVM(kernel='poly', gamma='scale', nu=0.01)
# model4 = LocalOutlierFactor(
#     n_neighbors=300, metric="euclidean", contamination=0.1)
model4 = LocalOutlierFactor(n_neighbors=200, algorithm="brute",
                            leaf_size=200, contamination=0.1)  # fix
# model5 = DBSCAN()
model6 = EllipticEnvelope(
    contamination=0.10, random_state=100, support_fraction=0.1)  # fix

# model fitting outlier detection
# print("====== OUTLIER DETECTION =======")
X_train_pred2, X_test_pred2 = model2.fit_predict(
    df_X_train), model2.fit_predict(df_X_test)
X_train_pred3, X_test_pred3 = model3.fit_predict(
    df_X_train), model3.fit_predict(df_X_test)
X_train_pred4, X_test_pred4 = model4.fit_predict(
    df_X_train), model4.fit_predict(df_X_test)
# y_pred5 = model5.fit_predict(df)
X_train_pred6, X_test_pred6 = model6.fit_predict(
    df_X_train), model6.fit_predict(df_X_test)


# print("====== NOVELTY DETECTION =======")
Example #59
0
from sklearn import svm
from sklearn.covariance import EllipticEnvelope

# Example settings
n_samples = 200
outliers_fraction = 0.25
clusters_separation = [0, 1, 2]

# define two outlier detection tools to be compared
classifiers = {
    "One-Class SVM":
    svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                    kernel="rbf",
                    gamma=0.1),
    "robust covariance estimator":
    EllipticEnvelope(contamination=.1)
}

# Compare given classifiers under given settings
xx, yy = np.meshgrid(np.linspace(-7, 7, 500), np.linspace(-7, 7, 500))
n_inliers = int((1. - outliers_fraction) * n_samples)
n_outliers = int(outliers_fraction * n_samples)
ground_truth = np.ones(n_samples, dtype=int)
ground_truth[-n_outliers:] = 0

# Fit the problem with varying cluster separation
for i, offset in enumerate(clusters_separation):
    np.random.seed(42)
    # Data generation
    X1 = 0.3 * np.random.randn(int(0.5 * n_inliers), 2) - offset
    X2 = 0.3 * np.random.randn(int(0.5 * n_inliers), 2) + offset
# elliptic envelope for imbalanced classification
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.covariance import EllipticEnvelope
# generate dataset
X, y = make_classification(n_samples=10000, n_features=2, n_redundant=0,
	n_clusters_per_class=1, weights=[0.999], flip_y=0, random_state=4)
# split into train/test sets
trainX, testX, trainy, testy = train_test_split(X, y, test_size=0.5, random_state=2, stratify=y)
# define outlier detection model
model = EllipticEnvelope(contamination=0.01)
# fit on majority class
trainX = trainX[trainy==0]
model.fit(trainX)
# detect outliers in the test set
yhat = model.predict(testX)
# mark inliers 1, outliers -1
testy[testy == 1] = -1
testy[testy == 0] = 1
# calculate score
score = f1_score(testy, yhat, pos_label=-1)
print('F-measure: %.3f' % score)