def outlier_dates(cy, alp, all_pollutants_df, pol_list):
    """Print when the first flagged outlier of each pollutant occurred.

    For every pollutant key in ``pol_list`` the county's readings are run
    through ``grubbs.max_test_outliers``. If anything is flagged, the index
    label of the first row whose reading equals the first flagged value is
    printed (only the first 10 characters of its string form -- presumably
    the date part of a timestamp; verify against the frame's index).

    Args:
        cy: County; passed to ``county_pollutants`` and echoed in the message.
        alp: Alpha value for the outlier function.
        all_pollutants_df: Frame forwarded to ``county_pollutants``.
        pol_list: Pollutant column keys; each must be a key of the
            ``pollutant_info`` table below.

    Returns:
        None. Results are written to stdout.
    """
    from outliers import smirnov_grubbs as grubbs

    # key -> [unit, display name]; only the display name is used here.
    pollutant_info = {
        'co': ['ppm', 'Carbon monoxide'],
        'no2': ['ppb', 'Nitrogen dioxide (NO2)'],
        'ozone': ['ppm', 'Ozone'],
        'pb': ['ug/m3', 'Lead'],
        'pm2_5': ['ug/m3', 'PM2.5'],
        'pm10': ['ug/m3', 'PM10'],
        'so2': ['ppb', 'Sulfur dioxide'],
    }
    county_pollutants_df = county_pollutants(cy, all_pollutants_df, pol_list)

    for pollutant in pol_list:
        readings = county_pollutants_df[pollutant]
        flagged = grubbs.max_test_outliers(list(readings), alpha=alp)
        if len(flagged) == 0:
            continue

        # Locate the first row holding the first flagged value.
        matching_rows = county_pollutants_df[readings == flagged[0]]
        when = str(matching_rows.index[0])
        print('The %s %s outlier occured on %s'
              % (cy, pollutant_info[pollutant][1], when[0:10]))
def getStatistics(list):
    """Summarise a sample as integer mean/var/std plus its largest outlier.

    The point estimates come from ``stats.bayes_mvs`` (called with
    ``alpha=0.95``); only the first element of each returned estimate is
    used. Each one is truncated with ``int()``, or replaced by 0 when it is
    NaN. The ``'outlier'`` entry is the maximum of the values flagged by
    ``grubbs.max_test_outliers`` at ``alpha=0.05``, or 0 when nothing is
    flagged.

    Args:
        list: The sample values (name kept for caller compatibility even
            though it shadows the builtin inside this function).

    Returns:
        dict with keys ``'mean'``, ``'var'``, ``'std'`` and ``'outlier'``.
    """
    frame = pd.DataFrame(list)
    estimates = stats.bayes_mvs(frame, alpha=0.95)

    statsm = {}
    for key, estimate in zip(('mean', 'var', 'std'), estimates):
        center = estimate[0]
        statsm[key] = 0 if math.isnan(center) else int(center)

    flagged = grubbs.max_test_outliers(list, alpha=0.05)
    statsm['outlier'] = numpy.amax(flagged) if flagged else 0
    return statsm
def test_grubbs(rv, N=200, alpha=0.05, N_reps=1000):
    """Repeatedly sample from ``rv`` and count how often outliers are flagged.

    Args:
        rv: Object exposing ``rvs(size=...)`` (e.g. a frozen scipy
            distribution -- presumably; only ``rvs`` is used here).
        N: Size of each drawn sample.
        alpha: Alpha passed to ``grubbs.max_test_outliers``.
        N_reps: Number of samples to draw.

    Returns:
        Tuple ``(fraction, values)``: the fraction of repetitions in which
        at least one value was flagged, and a flat list of every flagged
        value across all repetitions.
    """
    val_fa = []
    n_fa = 0
    for _ in range(N_reps):
        sample = rv.rvs(size=N)
        flagged = grubbs.max_test_outliers(sample, alpha=alpha)
        if len(flagged) == 0:
            continue
        n_fa += 1
        val_fa.extend(flagged)
    return n_fa / N_reps, val_fa
def outlier_test(values):
    """Report whether ``values`` contains an outlier.

    The outlier test is only run when the relative standard deviation
    (100 * std / mean) exceeds 10 %.

    Inputs:
        values: Absorbance readings to be used in the outlier test.

    Outputs:
        Prints "Outlier exists:" followed by the flagged value(s) when the
        test finds at least one outlier; otherwise prints "No outlier".
        Always returns None.
    """
    relstdev = 100 * (np.std(values) / np.mean(values))
    if relstdev > 10:
        outlier = grubbs.max_test_outliers(values, alpha=.05)
        # A high RSD alone is not proof of an outlier: only announce one
        # when the test actually flagged something (previously an empty
        # result was still reported as "Outlier exists: []").
        if len(outlier) > 0:
            print('Outlier exists:', outlier)
            return None
    print('No outlier')
    return None
def grubbs_cal(inputList, significance=0.05):
    """Print the values of ``inputList`` flagged by the max outlier test.

    Args:
        inputList: Data handed to ``grubbs.max_test_outliers``.
        significance: Forwarded as the test's ``alpha``.

    Returns:
        None. The flagged values are written to stdout.
    """
    print(grubbs.max_test_outliers(inputList, alpha=significance))
# Check if age is missing at random: cross-tabulate the rows lacking Age
# by survival and passenger class.
missing_age_rows = train.loc[train.Age.isnull()]
print(pd.crosstab(
    missing_age_rows['Survived'],
    missing_age_rows['Pclass'],
    rownames=["Rows Missing Age by Survived and Pclass"],
))

# ----------------------------------------------------------------------- #
# ------------------------ Dealing with Outliers ------------------------ #
# ----------------------------------------------------------------------- #

# Box plot of every float64 column, ignoring missing values.
numeric_cols = [name for name in train.columns
                if train[name].dtype == 'float64']
for col in numeric_cols:
    not_missing = ~np.isnan(train[col])
    sns.boxplot(y=train[not_missing][col])
    plt.title("Box Plot for " + col)
    plt.show()

# Grubbs test (original author's note: this is the generalized extreme
# studentized deviates test / iterative Grubbs). Show the rows whose Fare
# equals the first flagged value.
fare_outliers = grubbs.max_test_outliers(train['Fare'], alpha=0.05)
print(train.loc[train.Fare == fare_outliers[0]])

# Compute fare per person, since some passengers bought group tickets,
# producing fares that are sums of the individual ticket prices.
train['Set'] = 'train'
test['Set'] = 'test'
alldata = pd.concat(
    [train.drop(['Survived'], axis=1), test],
    ignore_index=True,
)
# Group size = number of passengers sharing the same (Fare, Ticket) pair.
alldata['Group_Size'] = (
    alldata.groupby(['Fare', 'Ticket'])['PassengerId'].transform("count")
)
alldata['Fare_Per_Person'] = alldata['Fare'] / alldata['Group_Size']

# Plot fare per person by passenger class.
for pclass, plot_title in (
    (1, "Box Plot for Fare Per Person - First Class"),
    (2, "Box Plot for Fare Per Person - Second Class"),
):
    sns.boxplot(y=alldata[alldata.Pclass == pclass]['Fare_Per_Person'].values)
    plt.title(plot_title)
    plt.show()
def test_one_sided_max_outlier_detection(self):
    """The max test must flag the sample maximum but never the minimum."""
    sample = self.rvs
    flagged = grubbs.max_test_outliers(sample, alpha=self.default_alpha)
    self.assertIn(sample.max(), flagged)
    self.assertNotIn(sample.min(), flagged)
def get_max_outliers(self, alpha=0.05):
    """Return the values of ``self.window`` flagged by the max outlier test.

    The window is wrapped in a ``pd.Series`` before being handed to
    ``grubbs.max_test_outliers``.

    Args:
        alpha: Alpha forwarded to the test.
    """
    return grubbs.max_test_outliers(pd.Series(self.window), alpha=alpha)
def outliersTest(list):
    """Return the values of ``list`` flagged by the max outlier test.

    Args:
        list: Data handed to ``grubbs.max_test_outliers`` with a fixed
            ``alpha`` of 0.05 (name kept for caller compatibility).
    """
    flagged = grubbs.max_test_outliers(list, alpha=0.05)
    return flagged
# Alpha handed to the Grubbs max test for every region type.
pval_thr = 0.00001
nd_regions = pd.read_csv(f, sep=',')

# Keep only region types that have at least 10 rows with a non-null
# 'Mean.Rho.bp.' value.
ok = nd_regions.type[~nd_regions['Mean.Rho.bp.'].isnull()].value_counts()
ok = ok[ok >= 10].index
filter_data = nd_regions[nd_regions.type.isin(ok)].copy()

# fill na: missing NucDiv values are replaced by the column minimum.
# Assign the column back explicitly; an in-place fillna on the attribute
# accessor (``filter_data.NucDiv.fillna(..., inplace=True)``) operates on
# an intermediate object and does not update ``filter_data`` under pandas
# Copy-on-Write.
mmin = filter_data.NucDiv.min()
filter_data['NucDiv'] = filter_data['NucDiv'].fillna(mmin)

################# outliers by region ##################
# Find outliers by region: run the max test on each type's non-null
# 'Mean.Rho.bp.' values.
gb = filter_data.groupby(['type'])['Mean.Rho.bp.']
outliers_by_region = gb.apply(
    lambda x: grubbs.max_test_outliers(x.dropna(), alpha=pval_thr))

# For each type, record the index label of the FIRST row whose value is
# among that type's flagged values (at most one row per type is removed).
outliersByReg = []
for ty in outliers_by_region.index:
    ii = filter_data[
        (filter_data.type == ty)
        & (filter_data['Mean.Rho.bp.'].isin(outliers_by_region[ty]))
    ].index.values
    if len(ii) > 0:
        outliersByReg.append(ii[0])

# Split the flagged rows off from the data used for the summary.
outlier_data = filter_data.loc[outliersByReg]
filter_data = filter_data.drop(outliersByReg)

# Per-type means of the remaining rows.
mm = filter_data.groupby(['type'])[[
    'NucDiv', 'GC', 'Mean.Rho.bp.', u'X.95..CI', u'X.95..CI.1', 'length'
]].mean()