def filter_remove_outlayers(self, flat, minimum_value=0):
    """
    Remove outliers using the elliptic envelope from scikit-learn.

    Every cell of the 2D array ``flat`` that is neither zero nor NaN becomes
    one ``(row, column, value)`` sample. Samples whose decision function value
    is below ``minimum_value`` are overwritten with NaN.

    :param flat: 2D array; it is modified in place
    :param minimum_value: decision function threshold below which a sample
        is treated as an outlier
    :return: the same ``flat`` array with outliers set to NaN
    """
    from sklearn.covariance import EllipticEnvelope

    # Zero out NaNs in a copy so np.nonzero skips them as well as real zeros.
    flat0 = flat.copy()
    flat0[np.isnan(flat)] = 0
    x, y = np.nonzero(flat0)

    z = flat[(x, y)]
    data = np.asarray([x, y, z]).T

    clf = EllipticEnvelope(contamination=.1)
    clf.fit(data)
    y_pred = clf.decision_function(data)

    out_inds = y_pred < minimum_value
    # np.nan instead of np.NaN: the capitalised alias was removed in NumPy 2.0.
    flat[(x[out_inds], y[out_inds])] = np.nan
    return flat
def envelop(self):
    """Fit an elliptic envelope on ``self.gen_tr_data`` and evaluate it.

    The fitted model predicts labels for ``self.gen_ts_data`` and
    ``self.imp_ts_data``; these are compared with the labels returned by
    ``get_gen_ts_labels()`` and ``get_imp_ts_labels()``.

    Returns:
        tuple ``(far, frr, pr)`` where ``far = fp / (fp + tn)``,
        ``frr = fn / (fn + tp)`` and ``pr = tp / (tp + fp)``.
    """
    # Make sure PCA is applied before using the envelope -- it is very
    # sensitive to the feature dimensions.
    detector = EllipticEnvelope(
        store_precision=True,
        assume_centered=False,
        support_fraction=0.25,
        contamination=0.1,
        random_state=True,
    )
    # Fit the model on the reduced-dimensionality training data.
    detector.fit(self.gen_tr_data)

    predicted = np.concatenate((
        detector.predict(self.gen_ts_data),
        detector.predict(self.imp_ts_data),
    ))
    actual = np.concatenate(
        (self.get_gen_ts_labels(), self.get_imp_ts_labels()))

    tn, fp, fn, tp = confusion_matrix(actual, predicted).ravel()
    far = fp / (fp + tn)
    frr = fn / (fn + tp)
    pr = tp / (tp + fp)
    return far, frr, pr
def predict_AB(train, test, result, num, sshop):
    """Assign ``sshop`` to the test rows an elliptic envelope accepts.

    The envelope is fitted on ``train`` (identifier, raw wifi and ``shop_id``
    columns dropped, NaNs replaced by -999). Test rows predicted as inliers
    (label ``1``) get ``shop_id = sshop`` and are appended to ``result``.

    Args:
        train: training DataFrame.
        test: test DataFrame; must contain a ``row_id`` column.
        result: DataFrame of earlier ``row_id`` / ``shop_id`` predictions.
        num: not used by the visible code (kept for interface compatibility).
        sshop: shop identifier written to every accepted test row.

    Returns:
        ``result`` with the new rows appended and ``row_id`` cast to int.
    """
    filter_feature_train = ['user_id', 'time_stamp', 'mall_id', 'wifi_infos',
                            'wifi_id_signal', 'shop_id']
    filter_feature_test = ['user_id', 'time_stamp', 'mall_id', 'wifi_infos',
                           'wifi_id_signal']
    train = train.drop(filter_feature_train, axis=1)
    test = test.drop(filter_feature_test, axis=1)
    train = train.fillna(-999)
    test = test.fillna(-999)
    # Align the test columns with the training columns, keeping row_id.
    test = test[list(train.columns)].join(test['row_id'])

    model = EllipticEnvelope()
    model.fit(train)
    test['label'] = model.predict(test.drop(['row_id'], axis=1))

    print('***************************', len(test))
    print(len(test[test['label'] == 1]))
    print('***************************')

    # Keep only the inliers. Take an explicit copy and assign the column
    # directly: the previous chained assignment on a filtered frame is a no-op
    # under pandas copy-on-write.
    test = test[test['label'] == 1].copy()
    test['shop_id'] = sshop

    r = test[['row_id', 'shop_id']]
    result = pd.concat([result, r])
    result['row_id'] = result['row_id'].astype('int')
    return result
def calc(self, outliers_fraction):
    """Rank samples by Mahalanobis distance and select the "bad" outliers.

    Args:
        outliers_fraction: contamination passed to ``EllipticEnvelope``.

    Returns:
        tuple ``(outliers, dqs, data)``: ``outliers`` is a list of frames
        derived from ``chi2_outliers`` and filtered below; ``dqs`` is returned
        as received from ``self.get_data()``; ``data`` is sorted by ``MDist``
        descending with the three ``T*`` feature columns dropped.
    """
    feature_cols = ['Tbandwidth', 'Tlatency', 'Tframerate']

    def _ranked(frame):
        # Sort by distance, de-duplicate and drop the raw feature columns.
        return (frame.sort_values(by='MDist', ascending=False)
                .drop_duplicates()
                .drop(feature_cols, axis=1))

    data, dqs, raw = self.get_data()
    clf = EllipticEnvelope(contamination=outliers_fraction)
    # Materialise the rows: on Python 3 zip() is a one-shot iterator, but the
    # samples are needed by both fit() and mahalanobis().
    X = list(zip(data['Tbandwidth'], data['Tlatency'], data['Tframerate']))
    clf.fit(X)
    data['MDist'] = clf.mahalanobis(X)

    # Picking "bad" outliers, not good ones.
    outliers = chi2_outliers(data, [.8, .9, .95], 3)
    outliers = [i[i['Tbandwidth'] < i['Tlatency']] for i in outliers]
    # Make sure aberrantly good frame rates are not removed.
    outliers = [
        k[k['Tframerate'] < (k['Tframerate'].mean() + k['Tframerate'].std())]
        for k in outliers
    ]
    outliers = [_ranked(t) for t in outliers]
    return outliers, dqs, _ranked(data)
def clean_series(self, token, discard=5):
    """
    Remove outliers from the ratio series for a token.

    Args:
        token: key into ``self.ratios``.
        discard (int): Drop the most outlying X% of the data.

    Returns: OrderedDict{year: wpm}
    """
    series = self.ratios[token]

    values = np.array(list(series.values()))
    X = values[:, np.newaxis]

    env = EllipticEnvelope()
    env.fit(X)

    # One decision-function score per data point.
    scores = env.decision_function(X).ravel()

    # Scores at or below this percentile are discarded.
    threshold = stats.scoreatpercentile(scores, discard)

    kept = OrderedDict()
    for (year, ratio), score in zip(series.items(), scores):
        if score > threshold:
            kept[year] = ratio
    return kept
def ellipticCurve(dataset):
    """Fit an elliptic envelope on ``dataset`` and list its outlier rows.

    The contamination is read from the module-level ``outlierFraction``.

    Returns:
        tuple ``(predScore, outlierRows)``: the decision function values and
        the positions of the rows predicted as ``-1``.
    """
    classifier = EllipticEnvelope(contamination=outlierFraction)
    classifier.fit(dataset)
    predScore = classifier.decision_function(dataset)
    pred = classifier.predict(dataset)

    outlierRows = []
    for row, label in enumerate(pred):
        if label == -1:
            outlierRows.append(row)
    return predScore, outlierRows
def predict_EllipticEnvelope(X, fraction_outlier):
    """Fit an elliptic envelope on ``X`` and write two diagnostic plots.

    Args:
        X: 2-feature sample array (grid points are predicted as pairs).
        fraction_outlier: contamination passed to ``EllipticEnvelope``.

    Returns:
        None. The plots are written by the ``P`` plotting helper to
        ``4_pred_EllipticEnvelope_density.png`` and
        ``4_pred_EllipticEnvelope.png``.
    """
    xx, yy = get_meshgrid(X)
    x1, x2 = xx.min(), xx.max()
    y1, y2 = yy.min(), yy.max()
    # 10% margin around the data; previously (x2 - x2), which is always zero.
    d = (x2 - x1) * 0.1

    A = EllipticEnvelope(contamination=fraction_outlier)
    A.fit(X)
    Y = A.predict(X)

    # Predict the whole grid in one call instead of one point at a time.
    grid_points = numpy.c_[xx.flatten(), yy.flatten()]
    grid_confidence = A.predict(grid_points).astype(int).reshape((100, 100))

    P.plot_contourf(X[Y > 0], X[Y <= 0], xx, yy, grid_confidence,
                    x_range=[x1 - d, x2 + d], y_range=[y1 - d, y2 + d],
                    filename_out='4_pred_EllipticEnvelope_density.png')
    P.plot_2D_features_multi_Y(X, -Y,
                               x_range=[x1 - d, x2 + d],
                               y_range=[y1 - d, y2 + d],
                               filename_out='4_pred_EllipticEnvelope.png')
    return
def train_model():
    """Train and save the scaler, the k-means model and the outlier model.

    Data comes from ``retrieve_data()``. If nothing is retrieved the script
    exits via ``sys.exit()``.
    """
    data = json_normalize(retrieve_data())
    if data.empty:
        # Early termination if no data was retrieved.
        print("No data retrieved, terminating script")
        sys.exit()

    vibration_range = data["vibration_max"] - data["vibration_min"]
    vector_distance = data["vector_distance"]

    # Scale the two features to [0, 1] after dropping incomplete rows.
    features = pd.DataFrame({
        'vector_distance': vector_distance,
        'vibration_distance': vibration_range,
    }).dropna()
    scaler = MinMaxScaler()
    training_data = scaler.fit_transform(features)

    # K-means clustering, with the two centroids initialised at the minimum
    # and maximum corners of the scaled space.
    init_cnts = np.array([[0.0, 0.0], [1.0, 1.0]])
    clustering = KMeans(n_clusters=2, random_state=42, init=init_cnts)
    clustering.fit(training_data)

    # Outlier detection.
    outlier = EllipticEnvelope(contamination=0.00075, random_state=42)
    outlier.fit(training_data)

    # Persist the three fitted objects.
    artifacts = (
        ("Saving scaler", scaler, "scaler.sav"),
        ("Saving clustering model", clustering, "kmeans_model.sav"),
        ("Saving outlier model", outlier, "outlier_model.sav"),
    )
    for message, fitted, filename in artifacts:
        print(message)
        save_model(fitted, filename)
def robustcovariance(nparray, contamination):
    """
    Flag outliers with scikit-learn's covariance.EllipticEnvelope, which fits
    a robust covariance estimate to the data and thus fits an ellipse to the
    central data points, ignoring points outside the central mode.

    Returns an array with 1 for samples predicted as outliers and 0 otherwise.

    References: Rousseeuw, P.J., Van Driessen, K. "A fast algorithm for the
    minimum covariance determinant estimator". Technometrics 41(3), 212 (1999)
    """
    frame = pd.DataFrame(nparray)

    # Fit the model and predict (-1 = outlier, 1 = inlier).
    detector = EllipticEnvelope(contamination=contamination)
    detector.fit(frame)
    labels = detector.predict(frame)

    # Re-encode: outlier -> 1, inlier -> 0, keeping the prediction dtype.
    return (labels == -1).astype(labels.dtype)
def show(samplepath):
    """Print and plot the samples an elliptic envelope flags as abnormal.

    Features are loaded with ``load_one_class_feature(samplepath)`` and
    normalised with ``norm_data``. The file ``<samplepath stem>_path.txt`` is
    read as one path per line and paired with the samples by position.
    Prints one line per outlier, then ``<outliers>/<total>``, and shows the
    decision boundary over a scatter of the first two feature columns.
    """
    sname = os.path.splitext(samplepath)[0]
    print(sname)
    with open(sname + "_path.txt", 'r') as f:
        paths = [line.strip() for line in f]

    X = load_one_class_feature(samplepath)
    X = norm_data(X)

    clf = EllipticEnvelope(contamination=0.05)
    clf.fit(X)
    Y = clf.predict(X)
    DY = clf.decision_function(X)

    for k, label in enumerate(Y):
        if label < 0:  # abnormality is positive
            print(k + 1, ',', DY[k], ',', paths[k])
    err = np.sum(Y < 0)
    print('%d/%d' % (err, len(Y)))

    x1, y1 = np.meshgrid(np.linspace(-20, 20, 400), np.linspace(-20, 20, 400))
    z1 = clf.decision_function(np.c_[x1.ravel(), y1.ravel()])
    z1 = z1.reshape(x1.shape)

    # contour() takes `colors`, not `color`.
    boundary = plt.contour(x1, y1, z1, levels=[0], linewidths=2, colors='r')
    plt.scatter(X[:, 0], X[:, 1], color='black')
    plt.legend([boundary.collections[0]], ['test'])
    plt.show()
class EllipticEnvelopeOutlierStream(OutlierStream):
    """Outlier stream backed by an ``EllipticEnvelope`` (contamination 0.045)."""

    def __init__(self, data, data_stream):
        OutlierStream.__init__(self, data, data_stream)
        self.model = EllipticEnvelope(contamination=0.045)
        self.DEBUG = False
        self.pca_plot = StreamPCA()

    def train_model(self, data):
        """Fit the envelope on ``data``."""
        self.model.fit(data)

    def update_model(self, data):
        """No incremental update is performed; always returns None."""
        return None

    def predict_model(self, data):
        """Return the envelope's predictions for ``data``."""
        return self.model.predict(data)

    def summary(self, predictions, data_stream):
        """Print inlier/outlier counts and scatter-plot the predictions."""
        inlier_count = sum(1 for value in predictions if value > 0)
        outlier_count = sum(1 for value in predictions if value < 0)
        print("Non outliers: {}".format(inlier_count))
        print("Outliers: {}".format(outlier_count))

        import numpy as np
        total = len(predictions)
        positions = np.linspace(0, total, total)
        plt.scatter(positions, predictions)
        plt.show()
def outliers_detection(expr):
    """Project ``expr`` to two PCA components and predict outliers.

    Returns the ``EllipticEnvelope.predict`` output for the projected samples.
    """
    projected = PCA(n_components=2).fit_transform(expr)
    detector = EllipticEnvelope()
    return detector.fit(projected).predict(projected)
def EllipticEnvelopeDetection(clm_select, all_tss, df_data, plot=False,
                              outliers_fraction=0.6):
    """Run an elliptic envelope on every series of every selected column.

    Args:
        clm_select: sequence of column names.
        all_tss: mapping ``column -> {kind: samples}``.
        df_data: DataFrame with a ``val`` column and the selected columns;
            only used for plotting.
        plot: when true, draw one scatter sub-plot per column.
        outliers_fraction: contamination passed to ``EllipticEnvelope``.
            Defaults to the previously hard-coded 0.6.
            NOTE(review): scikit-learn documents the valid range as
            (0, 0.5], so the default is rejected by current releases --
            confirm the intended value with the callers.

    Returns:
        dict mapping each column to the concatenated predictions of all its
        series, in the iteration order of ``all_tss[col]``.
    """
    if plot:
        plt.figure()

    ee_pred = {}
    for i, col in enumerate(clm_select):
        ee_pred[col] = []
        for series in all_tss[col].values():
            X = np.array(series)
            # A separate elliptic envelope is fitted for each series.
            clf = EllipticEnvelope(contamination=outliers_fraction)
            clf.fit(X)
            ee_pred[col].extend(clf.predict(X))

        if plot:
            subplot = plt.subplot(len(clm_select), 1, i + 1)
            subplot.scatter(df_data['val'], df_data[col], c=ee_pred[col])
            subplot.set_title('Dimension ' + col)

    if plot:
        plt.suptitle('Outlier detection with one class EllipticEnvelope')
        plt.show()
    return ee_pred
def outlier_removal2(features, samples, cv_predict):
    """Drop the 10% most outlying rows from ``features`` and ``samples``.

    The envelope is fitted on ``cv_predict`` and ``samples`` stacked
    column-wise. Rows whose decision function value is at or below the 10th
    percentile are removed.

    Returns:
        tuple ``(features[mask], samples[mask])`` with the kept rows.
    """
    outliers_fraction = 0.1
    print(cv_predict.shape)
    print(samples.shape)

    test = np.column_stack((cv_predict, samples))
    # Use the same fraction for the model and for the percentile cut-off.
    clf = EllipticEnvelope(contamination=outliers_fraction)
    clf.fit(test)

    y_pred = clf.decision_function(test).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred_new = y_pred > threshold

    print(y_pred_new)
    print(samples.shape)
    print(samples[y_pred_new].shape)
    print(features.shape)
    print(features[y_pred_new].shape)
    return features[y_pred_new], samples[y_pred_new]
def cov(self, X_train, contamination=None, random_state=None):
    """
    Train Elliptic Envelope model from scikit-learn

    Parameters
    __________
    X_train: scaled training data
    contamination: percentage of anomalies in the data; ``None`` falls back
        to 0.1, scikit-learn's documented default
    random_state: random number seed

    Returns
    ________
    Tuple ``(anomaly scores, labels)``: the negated decision function passed
    through ``self.min_max_scaler``, and labels with 1 for outliers and 0
    for inliers
    """
    if contamination is None:
        # EllipticEnvelope needs a float; None is not an accepted value.
        contamination = 0.1

    model = EllipticEnvelope(contamination=contamination,
                             random_state=random_state)
    model.fit(X_train)

    # predict() yields -1 for outliers and 1 for inliers. Compare directly so
    # the result is also right when every sample falls in the same class.
    labels = model.predict(X_train)
    labels = (labels == -1).astype(int)  # 1: outliers, 0: inliers

    # Negate so that larger values mean more anomalous.
    cov_anomaly_scores = model.decision_function(X_train) * -1
    cov_anomaly_scores = self.min_max_scaler(cov_anomaly_scores)

    return cov_anomaly_scores, labels
def DetectOutliersUsingEnvelope(self):
    """Drop envelope outliers, or return the decision surface for plotting.

    Returns:
        When ``self.__drop_outliers`` is true: the DataFrame without the rows
        predicted as outliers (also stored back on ``self.__df``).
        Otherwise: ``(xx, yy, Z)``, a 500x500 mesh spanning the data range
        widened by ``self.__factor`` and the decision function on that mesh.
    """
    data = self.__df[[self.__x, self.__y]].values
    clf = EllipticEnvelope()
    clf.fit(data)

    if self.__drop_outliers:
        # Outliers = -1, inliers = 1. Use the already fitted model (no second
        # fit) and a positional boolean mask, so the result does not depend on
        # the DataFrame's index labels.
        pred = clf.predict(data)
        self.__df = self.__df[pred != -1]
        return self.__df

    x_values = self.__df[self.__x].values
    y_values = self.__df[self.__y].values
    x_min_value = x_values.min() - self.__factor
    x_max_value = x_values.max() + self.__factor
    y_min_value = y_values.min() - self.__factor
    y_max_value = y_values.max() + self.__factor

    xx, yy = np.meshgrid(np.linspace(x_min_value, x_max_value, 500),
                         np.linspace(y_min_value, y_max_value, 500))
    Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
    Z = Z.reshape(xx.shape)
    return xx, yy, Z
def ellipses_indices_of_outliers(X, contamination=0.1):
    '''
    Detects outliers using the elliptical envelope method

    Input: An array of all variables to detect outliers for
    Output: The result of ``np.where`` over the outlier predictions (a tuple
            holding the array of outlier row indices), or None when there are
            fewer rows than the squared number of retained columns
    '''
    from sklearn.covariance import EllipticEnvelope

    # Work on a copy so the input array is left untouched.
    X = X.copy()

    # Columns with 30 or fewer distinct values are treated as categorical
    # and dropped.
    non_categorical = [
        feature
        for feature in range(X.shape[1])
        if len(np.unique(X[:, feature])) > 30
    ]
    X = X[:, non_categorical]

    # Bail out when there are too few samples for the number of features.
    if X.shape[0] < X.shape[1] ** 2.:
        print('Will not perform well. Reduce the dimensionality and try again.')
        return

    detector = EllipticEnvelope(contamination=contamination)
    detector.fit(X)

    # predict() marks outliers with -1.
    return np.where(detector.predict(X) == -1)
def detect_anomalies(self, data, **params):
    """Fit an elliptic envelope on ``data`` and return its anomaly scores.

    ``params`` are applied to the estimator through ``set_params``.
    The lower the returned score, the more abnormal the sample.
    """
    envelope = EllipticEnvelope().set_params(**params)
    envelope.fit(data)
    # TODO: decision function has other range than that of IsolationForest
    scores = envelope.decision_function(data)
    return scores
class EllipticEnvelope_Classifier:
    """Train and evaluate an ``EllipticEnvelope`` outlier classifier."""

    def __init__(self, save_path):
        # Output directory: <save_path>/EllipticEnvelope, created on demand.
        self.save_path = os.path.join(save_path, 'EllipticEnvelope')
        if not os.path.exists(self.save_path):
            os.makedirs(self.save_path)
        self.contamination = 0.1
        self.classifier = EllipticEnvelope(contamination=self.contamination)

    def fit_model(self, train_data_matrix, test_data_matrix, test_true_label):
        """Train the model and print its evaluation on the test data.

        Both matrices must provide ``toarray()``; they are densified before
        use. Evaluation is delegated to ``sklearn_evaluation``.
        """
        train_data_matrix = train_data_matrix.toarray()
        test_data_matrix = test_data_matrix.toarray()
        self.classifier.fit(train_data_matrix)

        y_pred_label = self.classifier.predict(test_data_matrix)
        accuracy, report, _ = sklearn_evaluation(test_true_label, y_pred_label)
        print('Accuracy: {} \nClassification Report:\n{}\n'.format(accuracy, report))
        sys.stdout.flush()

    def test_model(self, test_data):
        """Score ``test_data`` with the trained model.

        Labels follow the scikit-learn convention, e.g. ``[1, 1, -1, ...]``.

        Returns:
            tuple ``(scores_pred, y_pred, n_outliers)``: decision function
            values, predicted labels and the number of ``-1`` predictions.
        """
        # Previously this method had no `self` and read undefined names.
        scores_pred = self.classifier.decision_function(test_data)
        y_pred = self.classifier.predict(test_data)
        n_outliers = y_pred[y_pred == -1].size
        return scores_pred, y_pred, n_outliers
def outlier(TRAIN, contam):
    """Median-centre ``TRAIN`` in place, detect outliers and plot them.

    Returns the predictions re-encoded as 0 (outlier) / 1 (inlier).
    """
    # Subtract each column's median, writing back into the caller's array.
    for col in range(TRAIN.shape[1]):
        column = TRAIN[:, col]
        TRAIN[:, col] = column - np.median(column)

    # Model creation.
    clf = EllipticEnvelope(support_fraction=1., contamination=contam,
                           assume_centered=True)
    clf.fit(TRAIN)
    C = clf.correct_covariance(TRAIN)
    pred = clf.predict(TRAIN)

    # Eigen decomposition of the corrected covariance; project onto 2D.
    # NOTE(review): this takes the first two ROWS of U; if LA.eig follows the
    # numpy convention, eigenvectors are the COLUMNS -- confirm intent.
    E, U = LA.eig(C)
    P = U[0:2, :]
    X_hat = np.dot(TRAIN, np.transpose(P))

    # Map {-1, 1} to {0, 1} before plotting.
    pred = (pred + 1) // 2
    plotting(X_hat, pred)
    return pred
class EllipticEnvelopeFilter(BaseEstimator):
    """Pipeline step that removes the samples an elliptic envelope rejects."""

    def __init__(self, assume_centered=False, support_fraction=None,
                 contamination=0.1, random_state=None):
        self.assume_centered = assume_centered
        self.support_fraction = support_fraction
        self.contamination = contamination
        self.random_state = random_state

    def fit_pipe(self, X, y=None):
        """Fit the envelope on ``X`` and return the filtered ``(X, y)``."""
        envelope = EllipticEnvelope(**self.get_params())
        self.elliptic_envelope_ = envelope
        envelope.fit(X)
        return self.transform_pipe(X, y)

    def transform_pipe(self, X, y):
        """Keep only the rows predicted as inliers (label 1)."""
        # XXX: sample_props not taken care of
        keep = self.elliptic_envelope_.predict(X) == 1
        X_out = X[keep]
        y_out = None if y is None else y[keep]
        return X_out, y_out

    def transform(self, X, y=None):
        """Return ``X`` unchanged."""
        return X
def compare_drift(X_src, y_src, X_new, y_new):
    """Compare new data with source data via outlier share and distance.

    Envelopes fitted on the source features and targets predict the new
    data; the share of ``-1`` predictions is reported together with the
    Wasserstein distance between the flattened source and new values.

    Returns:
        dict with keys ``X_wasserstein_distance``, ``y_wasserstein_distance``,
        ``X_outlier_percentage`` and ``y_outlier_percentage``.
    """
    def _outlier_share(predictions):
        # Fraction of samples predicted as -1.
        return len(predictions[predictions == -1]) / len(predictions)

    clf_y = EllipticEnvelope(random_state=0, contamination=0.01)
    clf_X = EllipticEnvelope(random_state=0, contamination=0.01)

    clf_X.fit(X_src)
    clf_y.fit(y_src.reshape(y_src.size, 1))

    test_X = clf_X.predict(X_new)
    test_y = clf_y.predict(y_new.reshape(-1, 1))

    X_distance = wasserstein_distance(X_src.values.flatten(),
                                      X_new.values.flatten())
    y_distance = wasserstein_distance(y_src.flatten(), y_new.flatten())

    return {
        'X_wasserstein_distance': X_distance,
        'y_wasserstein_distance': y_distance,
        'X_outlier_percentage': _outlier_share(test_X),
        'y_outlier_percentage': _outlier_share(test_y),
    }
class EllipticDetection(BaseEstimator, TransformerMixin):
    """Transformer that replaces outlier rows with imputed values.

    With ``contamination == 0`` the transformer is a pass-through.
    """

    def __init__(self, contamination=0):
        self.contamination = contamination

    def fit(self, X, y=None):
        """Fit the envelope unless the transformer is disabled."""
        if self.contamination != 0:
            self.ell = EllipticEnvelope(contamination=self.contamination)
            fit_args = (X,) if y is None else (X, y)
            self.ell.fit(*fit_args)
        return self

    def transform(self, X_):
        """Return a deep copy of ``X_`` with outlier rows imputed."""
        X = deepcopy(X_)
        if self.contamination == 0:
            return X
        # Blank the rows predicted as outliers, then let SimpleImputer
        # (fitted on this same data) fill them in.
        X[self.ell.predict(X) == -1, :] = np.nan
        return SimpleImputer().fit_transform(X)
class Baseline(ModelBase):
    """Baseline anomaly detector selected by name.

    Recognised names: ``'svm'``, ``'if'``, ``'lof'``, ``'gm'`` and ``'ee'``.
    For any other name no model is created in ``__init__``.
    """

    def __init__(self, model_name, packet_length=1500, seq_length=1, epochs=1):
        super().__init__(packet_length, seq_length, epochs)
        self.model_name = model_name

        # Lazy builders so only the requested estimator is constructed.
        builders = (
            ('svm', lambda: OneClassSVM(kernel='rbf', nu=0.05)),
            ('if', lambda: IsolationForest(contamination=0.05,
                                           max_features=15, random_state=0)),
            ('lof', lambda: LocalOutlierFactor(contamination=0.05,
                                               novelty=True)),
            ('gm', lambda: GaussianMixture(random_state=0)),
            ('ee', lambda: EllipticEnvelope(contamination=0.05,
                                            random_state=0)),
        )
        for key, build in builders:
            if model_name == key:
                self.model = build()
                break

    def _model_path(self, name):
        # File name used by save/load/exist: <name>_<model_name>.pkl
        return name + '_{}.pkl'.format(self.model_name)

    def fit(self, X):
        """Fit the underlying model on ``X``."""
        self.model.fit(X)

    def predict(self, X):
        """Return ``(scores, labels)`` from ``score_samples`` and ``predict``."""
        labels = self.model.predict(X)
        scores = self.model.score_samples(X)
        return scores, labels

    def save(self, name):
        """Dump the model with joblib."""
        joblib.dump(self.model, self._model_path(name))

    def load(self, name):
        """Load the model with joblib."""
        self.model = joblib.load(self._model_path(name))

    def exist(self, name):
        """Return whether the model file exists."""
        return os.path.exists(self._model_path(name))
def filter_outliers_in_features(X):
    """Return the rows of ``X`` that an elliptic envelope labels as inliers.

    The envelope uses ``support_fraction=1`` and ``contamination=0.2``.
    """
    detector = EllipticEnvelope(support_fraction=1, contamination=0.2)
    detector.fit(X)
    inlier_mask = detector.predict(X) == 1
    return X[inlier_mask]
def anomaly_detection(X):
    """Fit an elliptic envelope on ``X`` and plot scores and outliers.

    Samples whose decision function value is below the 1.9th percentile are
    shown as outliers. The left panel shows the score distribution with the
    threshold; the right panel shows the first two columns of ``X`` with the
    outliers and the threshold contour. The threshold is also printed.
    """
    clf = EllipticEnvelope()
    clf.fit(X)
    scores = clf.decision_function(X).ravel()

    percentile = 1.9
    threshold = np.percentile(scores, percentile)
    print(threshold)
    is_outlier = scores < threshold

    # Decision surface over a fixed 200x200 grid.
    grid_x, grid_y = np.meshgrid(np.linspace(0, 25, 200),
                                 np.linspace(0, 30, 200))
    surface = clf.decision_function(np.c_[grid_x.ravel(), grid_y.ravel()])
    surface = surface.reshape(grid_x.shape)

    fig, (score_ax, data_ax) = plt.subplots(1, 2, figsize=(14, 5))

    sns.distplot(scores, rug=True, ax=score_ax)
    sns.distplot(scores[is_outlier], rug=True, hist=False, kde=False,
                 norm_hist=True, color='r', ax=score_ax)
    threshold_label = 'Threshold for {} percentile = {}'.format(
        percentile, np.round(threshold, 2))
    score_ax.vlines(threshold, 0, 0.9, colors='r', linestyles='dotted',
                    label=threshold_label)
    score_ax.set_title('Distribution of Elliptic Envelope decision function values')
    score_ax.legend(loc='best')

    flagged = X[is_outlier]
    data_ax.scatter(X[:, 0], X[:, 1], c='b', marker='x')
    data_ax.scatter(flagged[:, 0], flagged[:, 1], c='r', marker='x',
                    linewidths=2)
    data_ax.contour(grid_x, grid_y, surface, levels=[threshold], linewidths=2,
                    colors='red', linestyles='dotted')
    data_ax.set_title("Outlier detection")
    data_ax.set_xlabel('Latency (ms)')
    data_ax.set_ylabel('Throughput (mb/s)')
    plt.show()
def calcu2(mppt):
    """Return column 108 of the rows flagged as outliers.

    The envelope (contamination 0.01) is fitted on the first 106 columns of
    ``mppt``.
    """
    detector = EllipticEnvelope(contamination=0.01)
    fit_columns = mppt.iloc[:, 0:106]
    detector.fit(fit_columns)
    is_outlier = detector.predict(fit_columns) == -1
    return mppt[is_outlier].iloc[:, 108]
def view_anomalies(df):
    """Detect price anomalies per Saturday-night class and plot them.

    ``price_usd`` is split on ``srch_saturday_night_bool`` (0 / 1) and a
    separate elliptic envelope (contamination 0.01) is fitted per class. The
    predictions are written to ``df['anomaly5']`` on the caller's frame, whose
    index is first replaced by that of ``reindex_data(df)``. Four figures are
    drawn; nothing is returned.
    """
    outliers_fraction = 0.01

    def _score_prices(prices):
        # Fit an envelope on one price series; return it as a frame with
        # 'deviation' (decision function) and 'anomaly' (prediction) columns.
        envelope = EllipticEnvelope(contamination=outliers_fraction)
        X_train = prices.values.reshape(-1, 1)
        envelope.fit(X_train)
        scored = pd.DataFrame(prices)
        scored['deviation'] = envelope.decision_function(X_train)
        scored['anomaly'] = envelope.predict(X_train)
        return scored

    def _split_prices(scored):
        # Prices of normal (1) and anomalous (-1) rows, in that order.
        normal = scored.loc[scored['anomaly'] == 1, 'price_usd']
        anomalous = scored.loc[scored['anomaly'] == -1, 'price_usd']
        return [normal, anomalous]

    data = reindex_data(df)
    df.index = data.index

    df_class0 = df.loc[df['srch_saturday_night_bool'] == 0, 'price_usd']
    df_class1 = df.loc[df['srch_saturday_night_bool'] == 1, 'price_usd']

    # Raw price histograms per class.
    fig, axs = plt.subplots(1, 2)
    df_class0.hist(ax=axs[0], bins=30)
    df_class1.hist(ax=axs[1], bins=30)

    df_class0 = _score_prices(df_class0)
    df_class1 = _score_prices(df_class1)

    # Price distribution per class, with anomalies stacked in red.
    fig, axs = plt.subplots(1, 2)
    axs[0].hist(_split_prices(df_class0), bins=32, stacked=True,
                color=['blue', 'red'])
    axs[1].hist(_split_prices(df_class1), bins=32, stacked=True,
                color=['blue', 'red'])
    axs[0].set_title("Search Non Saturday Night")
    axs[1].set_title("Search Saturday Night")

    df_class = pd.concat([df_class0, df_class1])
    df['anomaly5'] = df_class['anomaly']

    # Price over time, with anomalies highlighted.
    fig, ax = plt.subplots(figsize=(10, 6))
    df = df.sort_values('date_time')
    df['date_time_int'] = pd.to_datetime(df['date_time']).astype('int64')
    a = df.loc[df['anomaly5'] == -1, ('date_time_int', 'price_usd')]  # anomaly
    ax.plot(df['date_time_int'], df['price_usd'], color='blue', label='Normal')
    ax.scatter(a['date_time_int'], a['price_usd'], color='red', label='Anomaly')
    plt.legend()

    # Overall price histogram, normal vs anomalous.
    a = df.loc[df['anomaly5'] == 1, 'price_usd']
    b = df.loc[df['anomaly5'] == -1, 'price_usd']
    fig, axs = plt.subplots(figsize=(10, 6))
    axs.hist([a, b], bins=32, stacked=True, color=['blue', 'red'])
    plt.show()
def train(featuremethods, trainingdata, classification="anomaly_detection",
          gamma=0, nu=0.5, features=None):
    """Standardise features and train the requested model.

    Args:
        featuremethods: callables, each mapping one training item to a number.
        trainingdata: list of ``[listoflines, value]`` pairs. For anomaly
            detection, value is always 0.
        classification: ``"anomaly_detection"`` (one-class SVM),
            ``"elliptic_envelope"``, or anything else for an ``SVC``.
        gamma: passed to the one-class SVM.
        nu: passed to the one-class SVM.
        features: optional precomputed feature rows; computed from
            ``trainingdata`` when empty or None.

    Returns:
        ``[model, means, stdevs]``; for ``"elliptic_envelope"`` the first
        element is the decision function on the training features instead of
        the model.
    """
    if not features:
        features = [
            [method(bunchoflines) for method in featuremethods]
            for bunchoflines in trainingdata
        ]

    n_features = len(featuremethods)
    means = [np.array([row[j] for row in features]).mean()
             for j in range(n_features)]
    stdevs = [np.array([row[j] for row in features]).std()
              for j in range(n_features)]

    # Standardise a deep copy so the caller's feature rows stay untouched.
    tempfeatures = copy.deepcopy(features)
    for row in tempfeatures:
        for j in range(len(row)):
            row[j] -= means[j]
            row[j] /= stdevs[j]

    if classification == "anomaly_detection":
        svr = svm.OneClassSVM(kernel='rbf', degree=3, coef0=0.0, tol=0.001,
                              shrinking=True, cache_size=200, verbose=False,
                              max_iter=-1, random_state=None, gamma=gamma,
                              nu=nu)
        svr.fit(tempfeatures)
    elif classification == "elliptic_envelope":
        svr = EllipticEnvelope()
        # The envelope must be fitted before decision_function can be used;
        # the missing fit made this branch fail.
        svr.fit(tempfeatures)
        return [svr.decision_function(tempfeatures), means, stdevs]
    else:
        svr = svm.SVC(cache_size=200, class_weight=None, coef0=0.0,
                      kernel="rbf", max_iter=-1, probability=False,
                      random_state=None, shrinking=True, tol=0.001,
                      verbose=False)
        svr.fit(tempfeatures, [1 for i in trainingdata])
    return [svr, means, stdevs]
def elliptic_envelope_out(self, contamination):
    """Return the index labels of training rows flagged as outliers.

    Appends ``'elliptic_envelope_out'`` to ``self.report`` and fits the
    envelope on ``self.training[self.numerical_var]``.
    """
    self.report.append('elliptic_envelope_out')
    ds = self.training[self.numerical_var]

    elliptic = EllipticEnvelope(contamination=contamination)
    elliptic.fit(ds)

    # Align predictions with the training index, then keep the -1 rows.
    flags = pd.Series(elliptic.predict(ds), index=ds.index)
    return flags[flags == -1].index
def plot(X, y):
    """Scatter the t-SNE projection of the inlier samples of ``X``.

    Outliers are detected on the original features with an elliptic envelope
    (``assume_centered=True``, contamination 0.25). Only inlier rows of the
    projection and of ``y`` are passed to ``scatter``.
    """
    proj = TSNE().fit_transform(X)

    # Outlier detection.
    e = EllipticEnvelope(assume_centered=True, contamination=.25)
    e.fit(X)
    good = np.where(e.predict(X) == 1)

    # Filter the projection together with the labels; previously only X and y
    # were filtered, so proj and y had different lengths.
    scatter(proj[good], y[good])
def clean_reviewer_average_radius_with_EllipticEnvelope(reviews):
    """Return the entries of ``reviews`` whose centre is not an outlier.

    Centres and matching user ids come from ``get_all_centers_as_array``;
    the envelope uses contamination 0.005.
    """
    classifier = EllipticEnvelope(contamination=0.005)
    centers, user_ids = get_all_centers_as_array(reviews)
    classifier.fit(centers)

    inlier_indexes = np.where(classifier.predict(centers) != -1)
    kept_ids = np.array(user_ids)[inlier_indexes]
    return {user_id: reviews[user_id] for user_id in kept_ids}
def labelValidSkeletons_old(skeletons_file, good_skel_row,
                            fit_contamination=0.05):
    """Label valid skeletons with an elliptic envelope and save the result.

    ``is_good_skel`` starts as a copy of ``has_skeleton``. When
    ``good_skel_row`` is non-empty, an envelope is fitted on the features of
    those rows and every row is relabelled 1 where the decision function is
    positive, 0 otherwise. The table is then written back with
    ``saveModifiedTrajData``.

    Args:
        skeletons_file: path of the HDF5 skeletons file.
        good_skel_row: array of row indexes used to fit the envelope.
        fit_contamination: contamination passed to ``EllipticEnvelope``.
    """
    base_name = getBaseName(skeletons_file)
    progress_timer = timeCounterStr('')

    print_flush(base_name + ' Filter Skeletons: Starting...')
    with pd.HDFStore(skeletons_file, 'r') as table_fid:
        trajectories_data = table_fid['/trajectories_data']

    trajectories_data['is_good_skel'] = trajectories_data['has_skeleton']

    # Nothing to do if there are no valid skeletons left.
    if good_skel_row.size > 0:
        print_flush(
            base_name +
            ' Filter Skeletons: Reading features for outlier identification.')
        # Features used to fit the outlier classifier.
        nodes4fit = ['/skeleton_length', '/contour_area'] + \
            ['/' + name_width_fun(part) for part in worm_partitions]

        X4fit = nodes2Array(skeletons_file, nodes4fit, good_skel_row)
        assert not np.any(np.isnan(X4fit))

        print_flush(
            base_name +
            ' Filter Skeletons: Fitting elliptic envelope. Total time:' +
            progress_timer.getTimeStr())
        # TODO: singular covariance matrices are still a problem here.
        clf = EllipticEnvelope(contamination=fit_contamination)
        clf.fit(X4fit)

        print_flush(base_name +
                    ' Filter Skeletons: Calculating outliers. Total time:' +
                    progress_timer.getTimeStr())
        # Score every row (all indexes) with the fitted classifier.
        X = nodes2Array(skeletons_file, nodes4fit)
        # Less than zero would be an outlier.
        y_pred = clf.decision_function(X).ravel()

        print_flush(
            base_name +
            ' Filter Skeletons: Labeling valid skeletons. Total time:' +
            progress_timer.getTimeStr())
        # Builtin int: the np.int alias was removed in NumPy 1.24.
        trajectories_data['is_good_skel'] = (y_pred > 0).astype(int)

    # Save the new is_good_skel column.
    saveModifiedTrajData(skeletons_file, trajectories_data)

    print_flush(base_name + ' Filter Skeletons: Finished. Total time:' +
                progress_timer.getTimeStr())
def filterOut(x):
    """Return a boolean mask that is False for the 5% most outlying samples.

    A sample is kept (True) when its decision function value is strictly
    above the 5th percentile of all values.
    """
    samples = np.array(x)
    outliers_fraction = 0.05

    clf = EllipticEnvelope(contamination=outliers_fraction)
    clf.fit(samples)

    scores = clf.decision_function(samples).ravel()
    cutoff = stats.scoreatpercentile(scores, 100 * outliers_fraction)
    return scores > cutoff
def test_outlier_detection():
    """Check EllipticEnvelope's raw decision values and score on random data."""
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)

    clf = EllipticEnvelope(contamination=0.1)
    clf.fit(X)
    y_pred = clf.predict(X)

    # The raw decision function equals the Mahalanobis distance of the
    # centred samples.
    raw_decision = clf.decision_function(X, raw_mahalanobis=True)
    centred_distance = clf.mahalanobis(X - clf.location_)
    assert_array_almost_equal(raw_decision, centred_distance)

    # score() against all-inlier labels equals the share of predicted inliers.
    accuracy = clf.score(X, np.ones(100))
    n_outliers = y_pred[y_pred == -1].size
    assert_almost_equal(accuracy, (100 - n_outliers) / 100.0)
def module4(self):
    '''
    Detect outliers in colour-opponency features of an image using anomaly
    detection, and plot the fitted elliptic boundary.

    The image is read from '../saliency_detection/image/pearl.png'.
    '''
    # Get data.
    img = cv2.imread('../saliency_detection/image/pearl.png')
    b, g, r = cv2.split(img)

    # Each channel minus the mean of the other two, plus a yellow channel.
    B = b * 1. - (r * 1. + g * 1.) / 2.
    G = g * 1. - (r * 1. + b * 1.) / 2.
    R = r * 1. - (g * 1. + b * 1.) / 2.
    Y = (r * 1. + g * 1.) / 2. - np.abs(r * 1. - g * 1.) / 2. - b * 1.

    # Clamp negative values to 0.
    R[R < 0] = 0
    G[G < 0] = 0
    B[B < 0] = 0
    Y[Y < 0] = 0

    rg = cv2.absdiff(R, G)
    by = cv2.absdiff(B, Y)
    img1 = rg
    img2 = by

    # One (rg, by) sample per pixel; keep every second sample (odd indices).
    data = np.hstack((rg.reshape((-1, 1)), by.reshape((-1, 1))))
    data = data.astype(np.float64)
    data = data[1::2]

    # Grid.
    xx1, yy1 = np.meshgrid(np.linspace(-10, 300, 500),
                           np.linspace(-10, 300, 500))

    # Fit the model to find the boundary.
    # A larger contamination makes the ellipse smaller.
    clf = EllipticEnvelope(support_fraction=1, contamination=0.01)
    print('data.shape =>', data.shape)
    print('learning...')
    clf.fit(data)  # training; may fail when the data contains zeros
    print('complete learning!')

    # Evaluate the fitted model on the grid and draw the ellipse.
    z1 = clf.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
    z1 = z1.reshape(xx1.shape)
    plt.contour(xx1, yy1, z1, levels=[0], linewidths=2, colors='r')

    # Plot.
    plt.scatter(data[:, 0], data[:, 1], color='black')
    plt.title("Outlier detection")
    plt.xlim((xx1.min(), xx1.max()))
    plt.ylim((yy1.min(), yy1.max()))
    plt.pause(.001)

    cv2.imshow('rg', img1 / np.amax(img1))
    cv2.imshow('by', img2 / np.amax(img2))
def ellipticenvelope(data, fraction=0.02):
    """Return the 1-based positions of the samples flagged as anomalies.

    Args:
        data: samples to fit and score.
        fraction: contamination passed to ``EllipticEnvelope``.

    Returns:
        Column array of shape ``(k, 1)`` holding the 1-based row numbers of
        the samples predicted as ``-1``.
    """
    elenv = EllipticEnvelope(contamination=fraction)
    elenv.fit(data)
    score = elenv.predict(data)

    # 1-based row numbers as a column vector, filtered with a boolean mask.
    # The previous version relied on xrange, stacked arrays of incompatible
    # shapes and never selected anything.
    numeration = np.arange(1, len(data) + 1).reshape(-1, 1)
    anomalies = numeration[score == -1]
    return anomalies
def test_outlier_detection():
    """Check not-fitted errors, raw decision values and score consistency."""
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)

    # Both prediction entry points must refuse to run before fit().
    for method in (clf.predict, clf.decision_function):
        assert_raises(NotFittedError, method, X)

    clf.fit(X)
    y_pred = clf.predict(X)
    decision = clf.decision_function(X, raw_values=True)
    decision_transformed = clf.decision_function(X, raw_values=False)

    assert_array_almost_equal(decision, clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)

    n_outliers = y_pred[y_pred == -1].size
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - n_outliers) / 100.0)
    assert sum(y_pred == -1) == sum(decision_transformed < 0)
def labelValidSkeletons(skel_file, valid_index, trajectories_data,
                        fit_contamination=0.05):
    """Auto-label skeletons as good using an elliptic envelope.

    The envelope is fitted on the rows in ``valid_index``, every row is then
    scored, and ``trajectories_data['auto_label']`` is set to
    ``wlab['GOOD_SKE']`` where the decision function is positive and to 0
    elsewhere. The labels are stored with ``saveLabelData``.
    """
    # Calculate valid widths if they were not calculated before.
    calculate_widths(skel_file)

    # Fit the outlier classifier on the valid rows only.
    X4fit = nodes2Array(skel_file, valid_index)
    clf = EllipticEnvelope(contamination=fit_contamination)
    clf.fit(X4fit)

    # Score all rows with the fitted classifier.
    X = nodes2Array(skel_file)
    y_pred = clf.decision_function(X).ravel()  # below zero is an outlier

    # Builtin int: the np.int alias was removed in NumPy 1.24.
    trajectories_data['auto_label'] = (
        (y_pred > 0).astype(int)) * wlab['GOOD_SKE']
    saveLabelData(skel_file, trajectories_data)
def test_elliptic_envelope():
    """Check not-fitted errors, score_samples and decision consistency."""
    rnd = np.random.RandomState(0)
    X = rnd.randn(100, 10)
    clf = EllipticEnvelope(contamination=0.1)

    # Both prediction entry points must refuse to run before fit().
    for method in (clf.predict, clf.decision_function):
        assert_raises(NotFittedError, method, X)

    clf.fit(X)
    y_pred = clf.predict(X)
    scores = clf.score_samples(X)
    decisions = clf.decision_function(X)

    # score_samples is the negated Mahalanobis distance.
    assert_array_almost_equal(scores, -clf.mahalanobis(X))
    assert_array_almost_equal(clf.mahalanobis(X), clf.dist_)

    n_outliers = y_pred[y_pred == -1].size
    assert_almost_equal(clf.score(X, np.ones(100)),
                        (100 - n_outliers) / 100.)
    assert(sum(y_pred == -1) == sum(decisions < 0))
def model_2_determine_test_data_similarity(self, model):
    """Fit one elliptic envelope per entry of ``model``.

    Returns:
        tuple ``(clf_EE, model_EE)`` of dicts keyed by position: the
        estimator created for ``model[i]`` and the value returned by its
        ``fit`` call.
    """
    estimators = {}
    fitted = {}
    for idx in range(len(model)):
        estimator = EllipticEnvelope(contamination=0.01, support_fraction=1)
        estimators[idx] = estimator
        fitted[idx] = estimator.fit(model[idx])
    return estimators, fitted
def labelValidSkeletons(skel_file):
    """Auto-label skeletons as good using an elliptic envelope.

    Valid rows and the trajectories table come from
    ``getValidIndexes(skel_file)``. The envelope (contamination 0.1) is
    fitted on the valid rows, every row is scored, and ``auto_label`` is set
    to ``wlab['GOOD_SKE']`` where the decision function is positive and to 0
    elsewhere. The labels are stored with ``saveLabelData``.
    """
    calculate_widths(skel_file)

    # Valid rows, selected from trajectory displacement and skeletonization
    # success.
    valid_index, trajectories_data = getValidIndexes(skel_file)

    # Fit the outlier classifier on the valid rows only.
    X4fit = nodes2Array(skel_file, valid_index)
    clf = EllipticEnvelope(contamination=.1)
    clf.fit(X4fit)

    # Score all rows with the fitted classifier.
    X = nodes2Array(skel_file)
    y_pred = clf.decision_function(X).ravel()  # below zero is an outlier

    # Builtin int: the np.int alias was removed in NumPy 1.24.
    trajectories_data['auto_label'] = (
        (y_pred > 0).astype(int)) * wlab['GOOD_SKE']
    saveLabelData(skel_file, trajectories_data)
def anomaly_detection(features, labels):
    """Try multivariate-Gaussian anomaly detection to identify POIs.

    Non-POIs (``labels == 0``) are split 75/25 into a training set and a
    remainder; the remainder and the POIs (``labels == 1``) are each split in
    half into test and cross-validation sets. An elliptic envelope is fitted
    on the training set, and its cross-validation predictions and score are
    printed. Nothing is returned.

    NOTE(review): non-POIs are labelled -1 and POIs +1, while the envelope is
    trained on non-POIs and predicts +1 for inliers -- confirm the encoding.
    NOTE(review): contamination=.85 is outside the (0, 0.5] range documented
    by scikit-learn -- confirm the intended value.
    """
    non_pois = features[labels == 0]
    pois = features[labels == 1]
    print("non poi size", non_pois.shape, pois.shape, features.shape)

    # Split the data into train, test and cross-validation sets.
    split1 = produce_spliting_array(non_pois.shape[0], .75)
    X_train = non_pois[split1 == 1]

    X_intermediate = non_pois[split1 == 0]
    print("size intermediate", X_intermediate.shape)

    split2 = produce_spliting_array(X_intermediate.shape[0], .5)
    X_test = X_intermediate[split2 == 1]
    # Builtin int: the np.int alias was removed in NumPy 1.24.
    label_test = np.zeros((X_test.shape[0],), dtype=int) - 1

    X_cv = X_intermediate[split2 == 0]
    label_cv = np.zeros((X_cv.shape[0],), dtype=int) - 1

    split3 = produce_spliting_array(pois.shape[0], .5)
    X_test = np.vstack((X_test, pois[split3 == 1]))
    label_test = np.hstack((label_test, np.ones(sum(split3), dtype=int)))

    X_cv = np.vstack((X_cv, pois[split3 == 0]))
    label_cv = np.hstack((label_cv, np.ones(sum(split3 == 0), dtype=int)))

    print("size X_train", X_train.shape)
    print("size test data", X_test.shape, label_test.shape)
    print("size cv data", X_cv.shape, label_cv.shape)
    print("size splits", len(split1), len(split2), len(split3))

    from sklearn.covariance import EllipticEnvelope
    detector = EllipticEnvelope(contamination=.85)
    detector.fit(X_train)
    pred_cv = detector.predict(X_cv)
    print(pred_cv)
    print(label_cv)
    print(detector.score(X_cv, label_cv))
def labelValidSkeletons_old(skeletons_file, good_skel_row, fit_contamination = 0.05):
    """Mark skeleton rows as good or bad using an elliptic envelope.

    ``is_good_skel`` starts as a copy of ``has_skeleton``.  When
    *good_skel_row* is not empty, an ``EllipticEnvelope`` is fitted on the
    length, area and width features of those rows, every row is scored, and
    ``is_good_skel`` is replaced by 1 for rows with a positive decision score
    and 0 otherwise.  The table is then saved with ``saveModifiedTrajData``.

    :param skeletons_file: HDF5 file holding ``/trajectories_data``.
    :param good_skel_row: array of row indexes used to fit the classifier.
    :param fit_contamination: contamination passed to ``EllipticEnvelope``.
    :return: None
    """
    base_name = getBaseName(skeletons_file)
    progress_timer = timeCounterStr('')

    print_flush(base_name + ' Filter Skeletons: Starting...')
    with pd.HDFStore(skeletons_file, 'r') as table_fid:
        trajectories_data = table_fid['/trajectories_data']

    trajectories_data['is_good_skel'] = trajectories_data['has_skeleton']

    if good_skel_row.size > 0:
        # nothing to do if there are not valid skeletons left.
        print_flush(base_name + ' Filter Skeletons: Reading features for outlier identification.')

        # calculate classifier for the outliers
        nodes4fit = ['/skeleton_length', '/contour_area'] + \
            ['/' + name_width_fun(part) for part in worm_partitions]
        X4fit = nodes2Array(skeletons_file, nodes4fit, good_skel_row)
        assert not np.any(np.isnan(X4fit))

        print_flush(base_name + ' Filter Skeletons: Fitting elliptic envelope. Total time:' + progress_timer.getTimeStr())
        # TODO here the is a problem with singular covariance matrices that
        # i need to figure out how to solve
        clf = EllipticEnvelope(contamination = fit_contamination)
        clf.fit(X4fit)

        print_flush(base_name + ' Filter Skeletons: Calculating outliers. Total time:' + progress_timer.getTimeStr())
        # calculate outliers using the fitted classifier
        X = nodes2Array(skeletons_file, nodes4fit)  # use all the indexes
        y_pred = clf.decision_function(X).ravel()  # less than zero would be an outlier

        print_flush(base_name + ' Filter Skeletons: Labeling valid skeletons. Total time:' + progress_timer.getTimeStr())
        # labeled rows of valid individual skeletons as GOOD_SKE.
        # Builtin ``int`` replaces ``np.int``, an alias of the builtin that
        # no longer exists in current NumPy releases.
        trajectories_data['is_good_skel'] = (y_pred > 0).astype(int)

    # Save the new is_good_skel column
    saveModifiedTrajData(skeletons_file, trajectories_data)

    print_flush(base_name + ' Filter Skeletons: Finished. Total time:' + progress_timer.getTimeStr())
def transform( features, labels ):
    """Project *features* onto 30 principal components and min-max scale them.

    An ``EllipticEnvelope`` is fitted on the projected features and its
    predictions are printed; they do not influence the returned values.

    :param features: feature matrix handed to ``PCA.fit_transform``.
    :param labels: returned unchanged.
    :return: tuple ``(scaled_features, labels)``.
    """
    print("transforming features via pca")
    reduced = PCA(n_components = 30).fit_transform( features )

    # Diagnostic output only: envelope predictions on the reduced features.
    detector = EllipticEnvelope()
    detector.fit( reduced )
    print(detector.predict( reduced ))

    rescaled = MinMaxScaler().fit_transform( reduced )
    return rescaled, labels
def detect_outliers(X, station):
    """Return a boolean mask that is True for the samples kept as inliers.

    For station ``'hoerning'`` a one-class SVM is fitted on standardised
    data; for every other station an ``EllipticEnvelope`` is fitted on the
    raw data.  In both cases the samples whose decision score lies above the
    ``100 * outlierfraction`` percentile of all scores are marked as inliers.

    :param X: sample matrix passed to the estimators.
    :param station: station name selecting the detector.
    :return: boolean array, one entry per row of *X*.
    """
    outlierfraction = 0.0015

    if station != 'hoerning':
        detector = EllipticEnvelope(contamination=outlierfraction)
        detector.fit(X)
        scores = detector.decision_function(X).ravel()
    else:
        detector = svm.OneClassSVM(nu=0.95*outlierfraction + 0.05, kernel='rbf', gamma=0.1)
        standardiser = StandardScaler(copy=True, with_mean=True, with_std=True).fit(X)
        standardised = standardiser.transform(X)
        detector.fit(standardised)
        scores = detector.decision_function(standardised).ravel()

    # Same percentile cut for both detectors.
    cutoff = stats.scoreatpercentile(scores, 100*outlierfraction)
    return scores > cutoff
def find_outlier_test_homes(df, all_homes, appliance, outlier_features, outliers_fraction=0.1):
    """Return the index labels of homes flagged as outliers for *appliance*.

    An ``EllipticEnvelope`` is fitted on the *outlier_features* columns of
    the rows ``all_homes[appliance]``.  If fitting fails, it is retried with
    the last one and then the last two features dropped.  Homes whose
    decision score is at or below the ``100 * outliers_fraction`` percentile
    are returned.  If no fit succeeds, a message is printed and all selected
    homes are returned.

    :param df: DataFrame indexed by home label.
    :param all_homes: mapping from appliance to the home labels to consider.
    :param appliance: key into *all_homes*.
    :param outlier_features: sliceable sequence of column names.
    :param outliers_fraction: fraction of homes to report as outliers.
    :return: list of index labels.
    """
    from scipy import stats
    from sklearn.covariance import EllipticEnvelope

    # ``.loc`` replaces ``.ix``, which no longer exists in pandas >= 1.0.
    # NOTE(review): assumes all_homes[appliance] holds index *labels*
    # (``.ix`` could fall back to positions on non-integer indexes) -- confirm.
    homes = df.loc[all_homes[appliance]]
    clf = EllipticEnvelope(contamination=.1)

    # Progressively smaller feature sets, tried in order until one fits.
    attempts = (outlier_features, outlier_features[:-1], outlier_features[:-2])
    for columns in attempts:
        try:
            X = homes[columns].values
            clf.fit(X)
        except Exception:
            # Best effort by design: any fitting/selection failure moves on
            # to the next feature set (but no longer swallows
            # KeyboardInterrupt/SystemExit like the bare ``except:`` did).
            continue
        break
    else:
        print("outlier cannot be found")
        return homes.index.tolist()

    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred = y_pred > threshold
    return homes[~y_pred].index.tolist()
def CovEstOD(data, classifier=None, N=1, **kw):
    """Detect outliers in *data* with a covariance-based classifier.

    :param data: sample matrix; ``data.shape[0]`` is the number of samples.
    :param classifier: object with ``fit`` and ``predict``; when None an
        ``EllipticEnvelope`` expecting *N* outliers is created.
    :param N: expected number of outliers, used only for the default
        classifier's contamination (``N / n_samples``).
    :param kw: ``with_decision_boundary=True`` additionally returns an
        ``Ellipse`` patch describing the decision boundary.
    :return: indexes of the samples predicted as -1, or the tuple
        ``(indexes, decision_boundary)`` when the boundary is requested.
    """
    if classifier is None:
        from sklearn.covariance import EllipticEnvelope
        # float() keeps the ratio fractional under Python 2 integer division,
        # where N / n_samples would otherwise truncate to 0.
        contamination = N / float(data.shape[0])
        classifier = EllipticEnvelope(support_fraction=1., contamination=contamination)

    classifier.fit(data)
    clipix, = np.where(classifier.predict(data) == -1)

    wdb = kw.pop('with_decision_boundary', False)
    if not wdb:
        return clipix

    # TODO: A better way of finding the decision boundary
    # The original referred to an undefined name ``clf`` here; the fitted
    # ``classifier`` is what is meant.
    precision = classifier.precision_
    # T (eigenvectors of precision matrix) is the transformation matrix
    # between principle axes and data coordinates
    w, T = np.linalg.eigh(precision)
    Ti = np.linalg.inv(T)
    # Diagonalizing the precision matrix ==> quadratic representation of the
    # decision boundary (ellipse): z^T M z = threshold, where x-<x> = Tz
    # transforms to principle axes.  This must be a matrix product; ``*`` on
    # ndarrays multiplies element-wise and does not diagonalize.
    M = np.dot(np.dot(Ti, precision), T)
    # NOTE(review): the ``threshold`` attribute name depends on the
    # scikit-learn version in use -- confirm it exists on the classifier.
    a, b = np.sqrt(classifier.threshold / np.diag(M))  # semi-major & semi-minor axes
    theta = np.degrees(np.arccos(T[0, 0]))  # T is (im)proper rotation matrix
    # If det(T)=-1 ==> improper rotation matrix (rotoinversion - one of the
    # axes is inverted)
    theta = np.linalg.det(T) * theta
    decision_boundary = Ellipse(classifier.location_, 2*a, 2*b, angle=theta, color='m')
    return clipix, decision_boundary
def find_outlier_train(ser, outliers_fraction=0.1, min_units=0.2):
    """Split the entries of *ser* above *min_units* into outliers and inliers.

    An ``EllipticEnvelope`` is fitted on the selected values (as a single
    feature column).  Entries whose decision score is at or below the
    ``100 * outliers_fraction`` percentile are treated as outliers.

    :param ser: one-dimensional data supporting boolean-mask indexing.
    :param outliers_fraction: fraction of the selected entries to flag.
    :param min_units: entries not greater than this value are ignored.
    :return: tuple ``(outliers, inliers)``, both subsets of the selection.
    """
    from scipy import stats

    selected = ser[ser > min_units]
    # ``Series.reshape`` is not available in current pandas, so reshape the
    # underlying ndarray; plain ndarrays (no ``.values``) pass through as is.
    X = getattr(selected, 'values', selected).reshape(-1, 1)

    # is_normal_data = is_normal(ser)
    # FOR NOW only using Robust estimator of Covariance
    is_normal_data = True
    if is_normal_data:
        # Use robust estimator of covariance
        from sklearn.covariance import EllipticEnvelope
        clf = EllipticEnvelope(contamination=.1)
    else:
        # Data is not normally distributed, use OneClassSVM based outlier
        # detection
        from sklearn import svm
        clf = svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,
                              kernel="rbf", gamma=0.1)

    clf.fit(X)
    y_pred = clf.decision_function(X).ravel()
    threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
    y_pred = y_pred > threshold
    return selected[~y_pred], selected[y_pred]
# Scatter the points of rcp_concat coloured by their label in L; label -1 is
# drawn in red, every other label indexes into a Blues colour ramp.
colors = plt.cm.Blues(np.linspace(0, 1, len(set(L))))
plt.figure(15)
for l in set(L):
    p = (L == l)
    color = 'r' if l == -1 else colors[l]
    plt.plot(rcp_concat[p, 0], rcp_concat[p, 1], 'o', c=color, markersize=10)
plt.show()

# -17- #
anom_perc = 20  # original 20
clf = EllipticEnvelope(contamination=.1)
clf.fit(rcp_concat)
# Decision scores of the fitted points (computed once; the original called
# decision_function twice and discarded the first result).
pred = clf.decision_function(rcp_concat).ravel()
threshold = stats.scoreatpercentile(pred, anom_perc)
# True where the score lies above the anom_perc percentile.
Anom = pred > threshold
print(Anom)

# Evaluate the decision function on the xx/yy grid and draw the score bands
# below the threshold plus the threshold contour itself.
Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()])
Z = Z.reshape(xx.shape)
plt.figure(16)
plt.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r)
plt.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red')
plt.plot(rcp_concat[:, 0], rcp_concat[:, 1], 'ko')
# Save before show(): once show() returns the figure may already have been
# closed, in which case savefig would write an empty image.
plt.savefig("../imagens/anomaly/ex17_20.png")
plt.show()
# End
# Scatter of classical Mahalanobis distance against robust distance.
plt.scatter( classical_md, robust_md, color = "green", alpha = 0.5 )
plt.title( "BBAutoTune \n\n Real Robot Forward Motion MD versus RD" )
plt.xlabel( "Mahalanobis Distance (MD)" )
plt.ylabel( "Robust Distance (RD)" )
#plt.plot( [ min( classical_md ), max( classical_md ) ], [ min( classical_md ), max( classical_md ) ], color = "red", alpha = 0.5 );

# Try the elliptical envelope now with the outliers gone.

print("EE:")

ssp = numpy.array( [ [ -10, 25.0, 0.0 ] ] )

print("Sample simulated point [[X',Y',T']]: ", ssp)

# NOTE(review): recent scikit-learn releases document contamination as
# limited to (0, 0.5]; 0.0 may be rejected -- confirm against the installed
# version.
ee = EllipticEnvelope( assume_centered = False, contamination = 0.0 )

# Fit once per data set so that the prediction and the distance are derived
# from the same fitted envelope (the original refitted before each call).
# mahalanobis() returns one value per sample; take the single entry
# explicitly before math.sqrt.
print("With outliers:")
ee.fit( forward_motion )
print("In envelope? ", ee.predict( ssp ))
print("MD: ", math.sqrt( ee.mahalanobis( ssp )[ 0 ] ))

print("Without outliers:")
ee.fit( forward_motion_clean )
print("In envelope? ", ee.predict( ssp ))
print("MD: ", math.sqrt( ee.mahalanobis( ssp )[ 0 ] ))

# Show the plots.

plt.show( )
# Compare given classifiers under given settings xx, yy = np.meshgrid(np.linspace(-0.1, 1.1, 1000), np.linspace(0, 100, 1000)) n_inliers = int((1. - outliers_fraction) * n_samples) n_outliers = int(outliers_fraction * n_samples) # Fit the problem with varying cluster separation np.random.seed(42) # Data generation # Fit the model with the One-Class SVM #plt.figure(figsize=(10, 5)) clf = EllipticEnvelope(contamination=.1) # fit the data and tag outliers clf.fit(XY) y_pred = clf.decision_function(XY).ravel() threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction) y_pred = y_pred > threshold # plot the levels lines and the points Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) subplot = ax[i] subplot.contourf(xx, yy, Z, levels=np.linspace(Z.min(), threshold, 7), cmap=plt.cm.Blues_r) a = subplot.contour(xx, yy, Z, levels=[threshold], linewidths=2, colors='red') subplot.contourf(xx, yy, Z, levels=[threshold, Z.max()], colors='orange') b = subplot.scatter(XY[:-n_outliers, 0], XY[:-n_outliers, 1], c='white')
def outliers_from_ellipticEnvelope():
    """Score the module-level ``features_pca`` with a default EllipticEnvelope.

    :return: flattened array of decision-function values for the samples
        the envelope was fitted on.
    """
    from sklearn.covariance import EllipticEnvelope

    envelope = EllipticEnvelope()
    envelope.fit(features_pca)
    scores = envelope.decision_function(features_pca)
    return scores.ravel()
from sklearn.cluster import KMeans
import numpy as np
from sklearn.decomposition import PCA
import matplotlib.pyplot as plt
import csv
from sklearn import svm
from sklearn.covariance import EllipticEnvelope
from scipy import stats

# Read the second and third column of every row of newdata.csv as integers.
# The file is opened in text mode with newline='' because csv.reader on
# Python 3 requires str rows (a file opened with 'rb' yields bytes and makes
# the reader fail).
with open('newdata.csv', newline='') as f:
    data = [[int(row[1]), int(row[2])] for row in csv.reader(f)]
data = np.array(data)
# print(data)

outliers_fraction = 0.05
# est=svm.OneClassSVM(nu=0.95 * outliers_fraction + 0.05,kernel="rbf", gamma=0.1)
est = EllipticEnvelope(contamination=.1)
# est=KMeans(n_clusters=3)
est.fit(data)
# labels=est.labels_

# Label 2 for samples scoring above the outliers_fraction percentile of the
# decision scores, label 1 for the rest.
y_pred = est.decision_function(data).ravel()
threshold = stats.scoreatpercentile(y_pred, 100 * outliers_fraction)
labels = [2 if y > threshold else 1 for y in y_pred]
# labels=est.labels_
print(labels)

plt.scatter(data[:, 0], data[:, 1], c=labels, lw=0)
plt.show()
# label=target_name.decode('utf8') ) x, y = find_boundary(X_transformed[kclusters == i, 0], X_transformed[kclusters == i, 1], 5) plt.plot(x, y, '-k', lw=2., color=cluster_color) # create a mesh to plot in h = .02 # step size in the mesh x_min, x_max = X_transformed[kclusters == i, 0].min() - 1, X_transformed[kclusters == i, 0].max() + 1 y_min, y_max = X_transformed[kclusters == i, 1].min() - 1, X_transformed[kclusters == i, 1].max() + 1 xx, yy = np.meshgrid(np.arange(x_min, x_max, h), np.arange(y_min, y_max, h)) clf = EllipticEnvelope(contamination=.1) clf.fit(X_transformed[kclusters == i]) pred = clf.decision_function(X_transformed[kclusters == i]).ravel() threshold = stats.scoreatpercentile(pred, 100 * outliers_fraction) print("INFO: Cluster: ", i, " Threshold: ", threshold) Z = clf.decision_function(np.c_[xx.ravel(), yy.ravel()]) Z = Z.reshape(xx.shape) # plt.contour(xx, yy, Z, # levels=[threshold], # linewidths=2, # linestyles='solid', # colors=(cluster_color,))
# # print(Y)

# Find outliers in the interaction rate data

# Step 1 - Convert the dataset into pandas series
util = Utility.SeriesUtility()
datasetFileName = "fans_change_taylor_swift.csv"
series = util.convertDatasetsToSeries(datasetFileName)
series = util.resampleSeriesSum(series, "D")

# ``series.shape`` replaces ``series.data.shape``: the ``data`` attribute is
# no longer available on pandas objects.
numberOfPoints = series.shape[0]
# One sample per row, single feature column.
X = series.values.flatten().reshape(numberOfPoints, 1)

det.fit(X)
predicted = det.predict(X)

# Report every sample predicted as -1.  The batch result is reused instead
# of calling det.predict(X[i]) per sample, which handed the detector a 1-D
# array and repeated work already done above.
# NOTE(review): assumes det.predict scores each row independently (as the
# scikit-learn outlier detectors do) -- confirm for the detector in use.
for i, outputClass in enumerate(predicted):
    if outputClass == -1:
        print("Outlier detected...")
def find_outliers(datestart,dateend,plot=False,cut=-0.05):
    """Return the topics whose page-view features look anomalous in a date window.

    Every row of the ``topics`` table is loaded, and for each topic the
    per-day average page-view count is read from MySQL.  The counts strictly
    between the two dates are turned into a feature vector with ``gen_feat``,
    an ``EllipticEnvelope`` is fitted on all vectors, and the topics whose
    decision score is below *cut* are reported.

    :param datestart: one bound of the window (converted with datetime2str2).
    :param dateend: other bound of the window; the bounds are swapped when
        given in reverse order.
    :param plot: when True, draw the zero-level decision contour and a
        scatter of the first two feature columns.
    :param cut: decision-function value below which a topic is reported.
    :return: list of rows ``[index, score, topic_label, rounded feature 0,
        rounded 100 * feature 1]`` sorted by the last entry, descending.
    """
    numtopics=84
    di=datetime2str2(datestart)
    dfin=datetime2str2(dateend)
    #print di,dfin
    # Make sure di holds the earlier and dfin the later bound.
    if dfin<di:
        temp=dfin
        dfin=di
        di=temp
    #print di,dfin
    # The database password is the first line of the auth file.
    # NOTE(review): the handle is closed manually; an exception in readline()
    # would leave it open -- a ``with`` block would be safer.
    afile="/home/ubuntu/mysql_insightwiki_auth.txt"
    a=open(afile)
    passwd=a.readline().rstrip()
    a.close()
    host='localhost'; user='******';db='wikidata'
    con = mdb.connect(host, user, passwd, db)#,port=3307)
    # First pass: load all topics; index 0 is a filler entry so that list
    # positions line up with the SQL ids.
    with con:
        curt= con.cursor()
        #sql="SELECT COUNT(*) FROM `topics` "
        sql="SELECT `Id`,`topic_label`,`topic_string` FROM `topics`;"
        curt.execute(sql)
        topics=[[0,'nothing','Filler to match index']]
        for topic in curt:
            topics.append(topic)
    data={}
    # NOTE(review): item assignment into ``range(...)`` below only works where
    # range() returns a list (Python 2) -- confirm the target interpreter.
    df=range(numtopics+1)
    # Second pass: per topic, read the daily average page views; results are
    # stored by topic label in ``data`` and by topic id in ``df``.
    with con:
        curt= con.cursor()
        sql="SELECT `Id`,`topic_label`,`topic_string` FROM `topics`;"
        curt.execute(sql)
        for row in curt:
            cur = con.cursor()
            sql='''SELECT `page_views`.`dateonly` AS `vd`, AVG(`page_views`.`count`) AS `vc`, `topics`.`topic_label`,`topics`.`topic_string` FROM `topics` INNER JOIN `page_views` ON `topics`.`ID` = `page_views`.`topic_id` WHERE `topic_id`=%s GROUP BY `page_views`.`dateonly` '''
            data[row[1]]=read_sql(sql, con,params=[row[0]])
            df[row[0]]=data[row[1]]
    topicdata=df
    d=topicdata[topics[3][0]]
    p=d[ (d['vd']>di) & (d['vd']<dfin )]['vc'].values
    topicdata=df
    #initializing array to hold the rows to cluster
    #the 0th position is fake so that my index matches the sql index
    clusinp=[]
    clusinp.append(gen_feat([0,0,0,0,0]))
    chinaoff=6000
    #populating the feature array that is fed to the outlier detector
    for index,topic in enumerate(topics):
        #topic=list(topics[index])
        if topic[0]!=0:
            d=topicdata[topic[0]]
            # Average daily views strictly inside the (di, dfin) window.
            ppre=d[ (d['vd']>di) & (d['vd']<dfin )]['vc'].values
            p=gen_feat(ppre)
            # Topic id 52 gets a fixed offset removed from its counts
            # (clamped at 0) before the features are generated.
            if topic[0]==52:
                p=gen_feat([x-chinaoff if x-chinaoff>=0 else 0 for x in ppre ])
            clusinp.append(p)
    #cleaning up my array, making it a numpy array for the detector
    clusinp=np.array(clusinp)
    clusinp[0]=clusinp[5] #making sure my throwaway first row matches in size
    #contam=0.325
    contamfix=0.1
    colors = ['m', 'g', 'b']
    X1=clusinp
    xx1, yy1 = np.meshgrid(np.linspace(0, 10000, 500), np.linspace(-1.5, 1.5, 500))
    ee=EllipticEnvelope(support_fraction=1., contamination=contamfix)
    #ee=OneClassSVM(nu=contam2, gamma=0.05,kernel='rbf')
    ee.fit(clusinp)
    # NOTE(review): the ``raw_values`` keyword may not be accepted by the
    # installed scikit-learn version -- confirm.
    outliers=ee.decision_function(X1, raw_values=False)
    if plot==True:
        print "here"
        get_ipython().magic(u'matplotlib inline')
        # Decision scores on the plotting grid; the contour at level 0 is
        # the drawn boundary.
        Z1 = ee.decision_function(np.c_[xx1.ravel(), yy1.ravel()])
        Z1 = Z1.reshape(xx1.shape)
        legend1 = plt.contour(xx1, yy1, Z1, levels=[0], linewidths=2, colors=colors[1])
        plt.scatter(X1[:, 0], X1[:, 1], color='black')
        plt.xlim((xx1.min(), xx1.max()))
        plt.ylim((yy1.min(), yy1.max()))
        plt.show()
    out=[]
    # Keep rows scoring below ``cut``, skipping the filler row 0 and rows
    # whose rounded first feature is not greater than 8.
    for index,outlier in enumerate(outliers):
        row=[index,outlier,topics[index][1],int(np.round(clusinp[index][0])),int(np.round(100*clusinp[index][1]))]
        #row=[index,outlier,topics[index][1],int(np.round(clusinp[index][0])),clusinp[index][1]]
        if outlier<cut and index!=0 and row[3]>8:
            out.append(row)
        #print index,outlier,topics[index][2],clusinp[index][0],clusinp[index][1]
    #out=sorted(out,operator.itemgetter(4))
    #out.sort()
    # Largest rounded 100 * feature-1 value first.
    out=sorted(out,key =lambda x:-x[4])
    return out
'Race-Black', 'Age', 'HAART-Naive', 'HAART-Non-Adherent', 'HAART-Off', 'HAART-On', 'Hepatitis C status (HCV)'] for col in tranfer_cols: _, cyto_data[col] = cyto_data.align(pat_data[col], join='left', axis = 0) cyto_data['HCV'] = cyto_data['Hepatitis C status (HCV)'] # <codecell> for col in cytos: env = EllipticEnvelope(contamination=0.05) env.fit(cyto_data[col].dropna().values.reshape(-1, 1)) mask = env.predict(cyto_data[col].values.reshape(-1,1)) cyto_data[col][mask==-1] = np.nan # <codecell> fig, axs = plt.subplots(11,3, figsize = (10,20)) for ax, col in zip(axs.flatten(), cytos): boxes = [] mus = [] stds = [] for trop in trops: mask = cyto_data['Tropism'] == trop