def test_categorical_nans(self):
    s = Series(pd.Categorical(list('aaaaabbbcc')))  # 4, 3, 2, 1 (nan)
    s.iloc[1] = np.nan
    result = s.value_counts()
    expected = pd.Series([4, 3, 2], index=pd.CategoricalIndex(
        ['a', 'b', 'c'], categories=['a', 'b', 'c']))
    tm.assert_series_equal(result, expected, check_index_type=True)

    result = s.value_counts(dropna=False)
    expected = pd.Series([4, 3, 2, 1],
                         index=pd.CategoricalIndex(['a', 'b', 'c', np.nan]))
    tm.assert_series_equal(result, expected, check_index_type=True)

    # out of order
    s = Series(pd.Categorical(
        list('aaaaabbbcc'), ordered=True, categories=['b', 'a', 'c']))
    s.iloc[1] = np.nan
    result = s.value_counts()
    expected = pd.Series([4, 3, 2], index=pd.CategoricalIndex(
        ['a', 'b', 'c'], categories=['b', 'a', 'c'], ordered=True))
    tm.assert_series_equal(result, expected, check_index_type=True)

    result = s.value_counts(dropna=False)
    expected = pd.Series([4, 3, 2, 1], index=pd.CategoricalIndex(
        ['a', 'b', 'c', np.nan], categories=['b', 'a', 'c'], ordered=True))
    tm.assert_series_equal(result, expected, check_index_type=True)
class ValueCounts(object):

    params = ['int', 'float', 'object']
    param_names = ['dtype']

    def setup(self, dtype):
        self.s = Series(np.random.randint(0, 1000, size=100000)).astype(dtype)

    def time_value_counts(self, dtype):
        self.s.value_counts()
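# The class above follows the asv (airspeed velocity) benchmark convention:
# the harness imports the module, calls setup() for each parameter value, then
# times each time_* method. Below is a minimal sketch of driving it by hand
# with timeit, outside asv; this driver is illustrative only and assumes the
# module-level imports (numpy as np, Series) that the class itself relies on.
import timeit

import numpy as np
from pandas import Series

bench = ValueCounts()
for dtype in ValueCounts.params:
    bench.setup(dtype)
    per_call = timeit.timeit(lambda: bench.time_value_counts(dtype),
                             number=20) / 20
    print('{}: {:.6f}s per value_counts() call'.format(dtype, per_call))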
def test_value_counts_bins(self):
    klasses = [Index, Series]
    for klass in klasses:
        s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
        s = klass(s_values)

        # bins
        self.assertRaises(TypeError,
                          lambda bins: s.value_counts(bins=bins), 1)

        s1 = Series([1, 1, 2, 3])
        res1 = s1.value_counts(bins=1)
        exp1 = Series({0.998: 4})
        tm.assert_series_equal(res1, exp1)
        res1n = s1.value_counts(bins=1, normalize=True)
        exp1n = Series({0.998: 1.0})
        tm.assert_series_equal(res1n, exp1n)

        self.assert_numpy_array_equal(s1.unique(),
                                      np.array([1, 2, 3], dtype=np.int64))
        self.assertEqual(s1.nunique(), 3)

        res4 = s1.value_counts(bins=4)
        exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1},
                      index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4, exp4)
        res4n = s1.value_counts(bins=4, normalize=True)
        exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25},
                       index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4n, exp4n)

        # handle NA's properly
        s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
                    'd', 'd', 'a', 'a', 'b']
        s = klass(s_values)
        expected = Series([4, 3, 2], index=['b', 'a', 'd'])
        tm.assert_series_equal(s.value_counts(), expected)

        exp = np.array(['a', 'b', np.nan, 'd'], dtype=np.object_)
        self.assert_numpy_array_equal(s.unique(), exp)
        self.assertEqual(s.nunique(), 3)

        s = klass({})
        expected = Series([], dtype=np.int64)
        tm.assert_series_equal(s.value_counts(), expected,
                               check_index_type=False)
        # returned dtype differs depending on original
        self.assert_numpy_array_equal(s.unique(), np.array([]),
                                      check_dtype=False)
        self.assertEqual(s.nunique(), 0)
def test_categorical(self):
    s = Series(pd.Categorical(list('aaabbc')))
    result = s.value_counts()
    expected = pd.Series([3, 2, 1],
                         index=pd.CategoricalIndex(['a', 'b', 'c']))
    tm.assert_series_equal(result, expected, check_index_type=True)

    # preserve order?
    s = s.cat.as_ordered()
    result = s.value_counts()
    expected.index = expected.index.as_ordered()
    tm.assert_series_equal(result, expected, check_index_type=True)
def test_value_counts(self):
    s = Series(['a', 'b', 'b', 'b', 'b', 'a', 'c', 'd', 'd', 'a'])
    hist = s.value_counts()
    expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
    assert_series_equal(hist, expected)

    # handle NA's properly
    s[5:7] = np.nan
    hist = s.value_counts()
    expected = s.dropna().value_counts()
    assert_series_equal(hist, expected)

    s = Series({})
    hist = s.value_counts()
    expected = Series([])
    assert_series_equal(hist, expected)
def main():
    path = 'usagov_bitly_data2013-05-17-1368832207'
    records = [json.loads(line) for line in open(path)]
    tzs = [rec['tz'] for rec in records if 'tz' in rec]

    counts = getCounts(tzs)
    print(counts)
    top10 = topCounts(counts)
    print(top10)

    # Pandas DataFrame demo
    frame = DataFrame(records)
    clean_tz = frame['tz'].fillna('Missing')
    clean_tz[clean_tz == ''] = 'Unknown'
    tz_counts = clean_tz.value_counts()
    print(tz_counts[:10])
    tz_counts[:10].plot(kind='barh', rot=0)
    plt.show()

    # Pandas Series demo
    results = Series([x.split()[0] for x in frame['a'].dropna()])
    agents_counts = results.value_counts()
    print(agents_counts[:8])

    cframe = frame[frame['a'].notnull()]
    os_seq = np.where(cframe['a'].str.contains('Windows'),
                      'Windows', 'Not Windows')
    print(os_seq[:8])

    by_tz_os = cframe.groupby(['tz', os_seq])
    agg_counts = by_tz_os.size().unstack().fillna(0)
    print(agg_counts[:8])

    indexer = agg_counts.sum(1).argsort()
    print(indexer[:8])
    counts_subset = agg_counts.take(indexer)[:10]
    counts_subset.plot(kind='barh', stacked=True)
    plt.show()
def count_fun4(records):
    frame = DataFrame(records)
    results = Series([x.split()[0] for x in frame.a.dropna()])
    print(results[:5])
    counts = results.value_counts()[:10]
    counts.plot(kind='barh', rot=0)
    plt.show()
def top10(tokens, text):
    obj = Series(tokens)
    top10 = obj.value_counts()[:10]
    print(top10)
    top10_list = list(top10.keys())
    text.dispersion_plot(top10_list)
    return top10_list
def test_categorical_zeroes(self):
    # keep the `d` category with 0
    s = Series(pd.Categorical(
        list('bbbaac'), categories=list('abcd'), ordered=True))
    result = s.value_counts()
    expected = Series([3, 2, 1, 0], index=pd.Categorical(
        ['b', 'a', 'c', 'd'], categories=list('abcd'), ordered=True))
    tm.assert_series_equal(result, expected, check_index_type=True)
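# As the categorical tests above pin down, value_counts() on a categorical
# Series reports every declared category, including unused ones with a zero
# count. A minimal standalone sketch (output shown modulo version-specific
# formatting, e.g. the result's name differs across pandas versions):
import pandas as pd

s = pd.Series(pd.Categorical(list('bbbaac'),
                             categories=list('abcd'), ordered=True))
print(s.value_counts())
# b    3
# a    2
# c    1
# d    0   <- the unused category 'd' is kept, with a count of 0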
class Algorithms(object):

    params = ['index', 'series']
    param_names = ['typ']

    def setup(self, typ):
        data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'),
                Period('2011-03', freq='M'), Period('2011-04', freq='M')]
        if typ == 'index':
            self.vector = PeriodIndex(data * 1000, freq='M')
        elif typ == 'series':
            self.vector = Series(data * 1000)

    def time_drop_duplicates(self, typ):
        self.vector.drop_duplicates()

    def time_value_counts(self, typ):
        self.vector.value_counts()
class period_algorithm(object):

    goal_time = 0.2

    def setup(self):
        data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'),
                Period('2011-03', freq='M'), Period('2011-04', freq='M')]
        self.s = Series(data * 1000)
        self.i = PeriodIndex(data, freq='M')

    def time_period_series_drop_duplicates(self):
        self.s.drop_duplicates()

    def time_period_index_drop_duplicates(self):
        self.i.drop_duplicates()

    def time_period_series_value_counts(self):
        self.s.value_counts()

    def time_period_index_value_counts(self):
        self.i.value_counts()
def main():
    path = '1_usagov_bitly_data2012-03-16-1331923249.txt'
    records = [json.loads(line) for line in open(path)]
    frame = DataFrame(records)

    tz_counts = frame['tz'].value_counts()
    print("Top timezones:")
    print(tz_counts[:10])
    print("")

    clean_tz = frame['tz'].fillna('Missing')
    clean_tz[clean_tz == ''] = 'Unknown'
    tz_counts2 = clean_tz.value_counts()
    print("Cleaned top timezones:")
    print(tz_counts2[:10])
    print("")

    agents = Series([x.split()[0] for x in frame['a'].dropna()])
    print("Top User Agents:")
    print(agents.value_counts()[:10])
    print()

    cframe = frame[frame['a'].notnull()]
    operating_system = np.where(
        cframe['a'].str.contains('Windows'),
        'Windows',
        'Not Windows'
    )
    by_timezone_os = cframe.groupby(['tz', operating_system])
    agg_counts = by_timezone_os.size().unstack()
    agg_counts.fillna(0, inplace=True)

    timezone_totals = agg_counts.sum(1).argsort()
    count_subset = agg_counts.take(timezone_totals)[-10:]
    print("OS split by top timezones by counts:")
    print(count_subset)
    print("")
class Algorithms(object):

    goal_time = 0.2

    def setup(self):
        data = [
            Period("2011-01", freq="M"),
            Period("2011-02", freq="M"),
            Period("2011-03", freq="M"),
            Period("2011-04", freq="M"),
        ]
        self.s = Series(data * 1000)
        self.i = PeriodIndex(data, freq="M")

    def time_drop_duplicates_pseries(self):
        self.s.drop_duplicates()

    def time_drop_duplicates_pindex(self):
        self.i.drop_duplicates()

    def time_value_counts_pseries(self):
        self.s.value_counts()

    def time_value_counts_pindex(self):
        self.i.value_counts()
def Main():
    client = github_helpers.authenticate()
    keywords = input("Please, enter keywords to search repositories: ")
    if keywords == '':
        keywords = 'javascript'
        print('No keywords provided. It will use the keyword: ' + keywords)
    search = client.search_repositories(keywords)
    first_page = search.get_page(0)

    languages = Series(r.language for r in first_page)
    languages = languages.dropna()
    languages = languages.sort_values()
    percentages = (100.0 * languages.value_counts() /
                   len(languages)).map('{:,.2f} %'.format)
    print('Languages percentage:')
    print(percentages)

    # Create plot
    x = [int(r.stargazers_count) for r in first_page]
    y = [int(r.forks) for r in first_page]
    # Add one to every value for logarithmic scale
    x = [val + 1 for val in x]
    y = [val + 1 for val in y]
    area = [100 for r in first_page]
    names = [r.name for r in first_page]
    colors = np.random.rand(len(first_page))

    pl.scatter(x, y, s=area, c=colors, alpha=0.5)
    for i in range(0, len(x)):
        pl.annotate(names[i], (x[i], y[i]), fontsize=2)
    pl.title("All values are with addition of 1 (for the logarithmic scale)")
    pl.xlabel("Stars")
    pl.xscale("log")
    pl.yscale("log")
    pl.ylabel("Forks")
    pl.tight_layout()

    filepath = 'reports/APIs/github'
    if not os.path.isdir(filepath):
        os.makedirs(filepath)
    filepath += '/search_repositories.png'
    pl.savefig(filepath, figsize=(1020, 1020), dpi=300)
    pl.close()
    print('A chart with high resolution and small font size '
          '(to minimize overlaps) was created at ' + filepath)
def analysis2(records):
    print('\nPandas analysis >>')
    frame = DataFrame(records)
    # pp.pprint(frame)

    # tz_counts = frame['tz'].value_counts()
    # pp.pprint(tz_counts[:10])

    clean_tz = frame['tz'].fillna('Missing')
    clean_tz[clean_tz == ''] = 'Unknown'
    tz_counts = clean_tz.value_counts()
    print('\n>> Time zones')
    pp.pprint(tz_counts[:10])

    # plt.figure(figsize=(10, 4))
    # tz_counts[:10].plot(kind='barh', rot=0)
    # plt.show()

    clean_url = frame.a.fillna('Missing')
    clean_url[clean_url == ''] = 'Unknown'
    urls = Series([x.split()[0] for x in clean_url])
    urls_counts = urls.value_counts()
    print('\n >> URLs')
    pp.pprint(urls_counts[:10])

    cframe = frame[frame.a.notnull()]
    operation_system = np.where(cframe.a.str.contains('Windows'),
                                'Windows', 'Not Windows')
    print('\n>> OS')
    # print(operation_system[:5])
    by_tz_os = cframe.groupby(['tz', operation_system])
    agg_counts = by_tz_os.size().unstack().fillna(0)
    indexer = agg_counts.sum(1).argsort()
    count_subset = agg_counts.take(indexer)[-10:]
    pp.pprint(count_subset)

    plt.figure(figsize=(10, 4))
    count_subset.plot(kind='barh', rot=0, stacked=True)
    plt.show()
    # (tail of scikit_Gaussian, which trains and scores a Gaussian
    # Naive Bayes classifier)
    classifier = GaussianNB()
    classifier.fit(features_train, target_train)
    prediction = classifier.predict(features_test)
    accuracy = accuracy_score(target_test, prediction, normalize=True)
    print('accuracy generated by scikit-learn: {}%'.format(accuracy * 100))
    print('-' * 50)
    return features, target, accuracy, prediction


sci_features, sci_target, sci_accuracy, sci_predictions = scikit_Gaussian(
    _dataset, split_ratio)

# -- plotting predictions for our model against real counts:
p_samples = Series(predictions)
p_counts = p_samples.value_counts()
t_sample = [item[-1] for item in test_set]
t_samples = Series(t_sample[:len(predictions)])
t_counts = t_samples.value_counts()
title = 'Comparing diabetes real classes counts to our model prediction'
legends = ['Our model', 'Real data']
plotting_data(t_counts, p_counts, title, legends)

# -- plotting predictions for sci-kit learn model against real counts:
p_samples = Series(sci_predictions)
p_counts = p_samples.value_counts()
t_samples = Series(sci_target[-len(sci_predictions):])
t_counts = t_samples.value_counts()
title = ('Comparing diabetes real classes counts to '
         'sci-kit learn model prediction')
legends = ['Sci-kit model', 'Real data']
plotting_data(t_counts, p_counts, title, legends)
print('-' * 50)
print(myFrame.sum(axis=1))
print('-' * 50)
print(myFrame.cumsum())  # accumulation method: running (cumulative) sums
print('-' * 50)
print(myFrame.mean(axis=0))
print('-' * 50)
print(myFrame.mean(axis=1))
print('-' * 50)
print(myFrame.mean(axis=1, skipna=False))
print('-' * 50)
print(myFrame.describe())  # summary statistics; the % rows are quartiles, std is the standard deviation
print('-' * 50)
print(myFrame.idxmax())
print('-' * 50)

mySeries = Series(['a', 'a', 'b', 'c', 'd'] * 2)
# unique: distinct values / top: most frequent value / freq: count of top
print(mySeries.describe())
print('-' * 50)
print(mySeries)
myUnique = mySeries.unique()
print(myUnique)
print('-' * 50)
print(mySeries.value_counts())
print('-' * 50)
print(pd.value_counts(mySeries.values, sort=False))
print('-' * 50)
mask = mySeries.isin(['b', 'c'])
print(mask)
print(mySeries[mask])
                   # (continuation of a dict comprehension building the
                   # `price` DataFrame from per-ticker data)
                   for tic, data in all_data.items()})

# percent changes of the prices:
returns = price.pct_change()
returns.tail()

returns.MSFT.corr(returns.IBM)  # correlation of the overlapping non-NA
returns.MSFT.cov(returns.IBM)   # covariance of the overlapping non-NA
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)
returns.corrwith(volume)

## Unique values, value counts, and membership
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
uniques
obj.value_counts()  # value frequencies

from pandas import value_counts
value_counts(obj.values, sort=False)

obj
mask = obj.isin(['b', 'c'])
obj[mask]
mask

data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
data
data.Qu1
data.Qu1.value_counts()
result = data.apply(value_counts).fillna(0)
tz_counts[:10]

# plot it
tz_counts[:10].plot(kind='barh', rot=0)

frame['a'][49]
results = Series([x.split()[0] for x in frame.a.dropna()])
# x.split() breaks the string into a list; [0] takes the first token.
# Series.dropna() drops missing data: for a Series it returns only the
# non-null values and their index labels. For a DataFrame, dropping means
# losing at least an entire row (or column), so
# dropna(axis=0, how='any', thresh=None) takes extra parameters:
# how='all' drops a row (column) only when every element is NA, and an
# integer thresh, e.g. thresh=3, keeps a row only if it has at least
# 3 non-NA values.
results[:5]
results.value_counts()[:8]

# Break the time zone information down into Windows and non-Windows users.
# For simplicity, assume a user is on Windows whenever the agent string
# contains 'Windows'. Some agents are missing, so remove those records
# first with notnull().
cframe = frame[frame.a.notnull()]
# isnull()/notnull() are applied elementwise and return an index plus
# boolean values (numpy-style boolean arrays), usable for boolean indexing.
operating_system = np.where(cframe['a'].str.contains('Windows'),
                            'Windows', 'Not Windows')
operating_system[:9]
# Save to a CSV file
import csv
import pandas as pd

try:
    f = csv.writer(open('ws1.csv', 'w', encoding='utf-8'))
    f.writerow(word_dict)
except Exception as e:
    print('err : ', e)

# df1 = pd.read_csv('ws1.csv', encoding='utf-8')
# print(df1)

with open('ws1.csv', 'r', encoding='utf-8') as f:
    print(f.read())
print()

from pandas import Series, DataFrame

li_data = Series(wordlist)
# print(li_data)
print(li_data.value_counts()[:5])
print()

li_data = Series(word_dict)
print(li_data.value_counts()[:5])
print('-----------------')

df = DataFrame(wordlist, columns=['단어'])  # '단어' means 'word'
print(df.head())
###############################################################
from pandas import Series, DataFrame
import numpy as np
from numpy.random import randn
import matplotlib.pyplot as plt

array1 = np.array([[10, np.nan, 20], [30, 40, np.nan]])
print(array1)

df1 = DataFrame(array1, index=[1, 2], columns=list('ABC'))
print(df1)

# sum()
print(df1.sum())
print(df1.sum(axis=1))

# min / max
print(df1.min())
print(df1.max())
print(df1.idxmax())
print(df1.idxmin())
print(df1.cumsum())
print(df1.describe())

df2 = DataFrame(randn(9).reshape(3, 3), index=[1, 2, 3], columns=list('ABC'))
print(df2)

plt.plot(df2)
plt.legend(df2.columns, loc="lower right")
plt.savefig('samplepic.png')
plt.show()

ser1 = Series(list('abcccaabd'))
print(ser1.unique())
print(ser1.value_counts())
def fit(self, X: np.ndarray, Z: np.ndarray, clusters: pd.Series,
        y: np.ndarray):
    """
    Fit MERF using the EM algorithm.

    :param X (np.ndarray): fixed effect covariates
    :param Z (np.ndarray): random effect covariates
    :param clusters (pd.Series): cluster assignments for samples
    :param y (np.ndarray): response/target variable
    :return: fitted model
    """
    if type(clusters) != pd.Series:
        raise TypeError("clusters must be a pandas Series.")

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Input Checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    assert len(Z) == len(X)
    assert len(y) == len(X)
    assert len(clusters) == len(X)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Initialization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
    n_clusters = clusters.nunique()
    n_obs = len(y)
    q = Z.shape[1]  # random effects dimension
    # Cast Z to a numpy array (required if it's a dataframe, otherwise
    # the matrix multiplications later fail)
    Z = np.array(Z)

    # Create a series where cluster_id is the index and n_i is the value
    cluster_counts = clusters.value_counts()

    # Do expensive slicing operations only once
    Z_by_cluster = {}
    y_by_cluster = {}
    n_by_cluster = {}
    I_by_cluster = {}
    indices_by_cluster = {}

    # TODO: Can these be replaced with groupbys?
    # Groupbys are less understandable than brute force.
    for cluster_id in cluster_counts.index:
        # Find the index for all the samples from this cluster in the
        # large vector
        indices_i = clusters == cluster_id
        indices_by_cluster[cluster_id] = indices_i

        # Slice those samples from Z and y
        Z_by_cluster[cluster_id] = Z[indices_i]
        y_by_cluster[cluster_id] = y[indices_i]

        # Get the counts for each cluster and create the appropriately
        # sized identity matrix for later computations
        n_by_cluster[cluster_id] = cluster_counts[cluster_id]
        I_by_cluster[cluster_id] = np.eye(cluster_counts[cluster_id])

    # Initialize for EM algorithm
    iteration = 0
    # Note we are using a dataframe to hold b_hat because it is easier to
    # index into by cluster_id. Before, a simple numpy array was indexed
    # incorrectly because the cluster_ids are not necessarily in order.
    b_hat_df = pd.DataFrame(np.zeros((n_clusters, q)),
                            index=cluster_counts.index)
    sigma2_hat = 1
    D_hat = np.eye(q)

    # vectors to hold history
    self.b_hat_history.append(b_hat_df)
    self.sigma2_hat_history.append(sigma2_hat)
    self.D_hat_history.append(D_hat)

    early_stop_flag = False

    while iteration < self.max_iterations and not early_stop_flag:
        iteration += 1
        logger.debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
        logger.debug("Iteration: {}".format(iteration))
        logger.debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ E-step ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # fill up y_star for all clusters
        y_star = np.zeros(len(y))
        for cluster_id in cluster_counts.index:
            # Get cached cluster slices
            y_i = y_by_cluster[cluster_id]
            Z_i = Z_by_cluster[cluster_id]
            b_hat_i = b_hat_df.loc[cluster_id]  # used to be ix
            logger.debug("E-step, cluster {}, b_hat = {}".format(
                cluster_id, b_hat_i))
            indices_i = indices_by_cluster[cluster_id]

            # Compute y_star for this cluster and put back in right place
            y_star_i = y_i - Z_i.dot(b_hat_i)
            y_star[indices_i] = y_star_i

        # check that y_star is still one dimensional
        # TODO: Other checks we want to do?
        assert len(y_star.shape) == 1

        # Do the fixed effects regression with all the fixed effects
        # features
        self.fe_model.fit(X, y_star)
        f_hat = self.fe_model.predict(X)

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ M-step ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        sigma2_hat_sum = 0
        D_hat_sum = 0

        for cluster_id in cluster_counts.index:
            # Get cached cluster slices
            indices_i = indices_by_cluster[cluster_id]
            y_i = y_by_cluster[cluster_id]
            Z_i = Z_by_cluster[cluster_id]
            n_i = n_by_cluster[cluster_id]
            I_i = I_by_cluster[cluster_id]

            # index into f_hat
            f_hat_i = f_hat[indices_i]

            # Compute V_hat_i
            V_hat_i = Z_i.dot(D_hat).dot(Z_i.T) + sigma2_hat * I_i

            # Compute b_hat_i
            V_hat_inv_i = np.linalg.pinv(V_hat_i)
            logger.debug("M-step, pre-update, cluster {}, b_hat = {}".format(
                cluster_id, b_hat_df.loc[cluster_id]))
            b_hat_i = D_hat.dot(Z_i.T).dot(V_hat_inv_i).dot(y_i - f_hat_i)
            logger.debug("M-step, post-update, cluster {}, b_hat = {}".format(
                cluster_id, b_hat_i))

            # Compute the total error for this cluster
            eps_hat_i = y_i - f_hat_i - Z_i.dot(b_hat_i)
            logger.debug("------------------------------------------")
            logger.debug("M-step, cluster {}".format(cluster_id))
            logger.debug("error squared for cluster = {}".format(
                eps_hat_i.T.dot(eps_hat_i)))

            # Store b_hat for the cluster in the dataframe.
            # Note this HAS to be assigned with loc, otherwise the whole
            # df gets erroneously assigned and things go to hell.
            b_hat_df.loc[cluster_id, :] = b_hat_i
            logger.debug("M-step, post-update, recalled from df, "
                         "cluster {}, b_hat = {}".format(
                             cluster_id, b_hat_df.loc[cluster_id]))

            # Update the sums for sigma2_hat and D_hat.
            # We will normalize after the entire loop over clusters.
            sigma2_hat_sum += eps_hat_i.T.dot(eps_hat_i) + sigma2_hat * (
                n_i - sigma2_hat * np.trace(V_hat_inv_i))
            D_hat_sum += np.outer(b_hat_i, b_hat_i) + (
                D_hat -
                D_hat.dot(Z_i.T).dot(V_hat_inv_i).dot(Z_i).dot(D_hat))

        # Normalize the sums to get sigma2_hat and D_hat
        sigma2_hat = (1.0 / n_obs) * sigma2_hat_sum
        D_hat = (1.0 / n_clusters) * D_hat_sum

        logger.debug("b_hat = {}".format(b_hat_df))
        logger.debug("sigma2_hat = {}".format(sigma2_hat))
        logger.debug("D_hat = {}".format(D_hat))

        # Store off history so that we can see the evolution of the
        # EM algorithm
        self.b_hat_history.append(b_hat_df.copy())
        self.sigma2_hat_history.append(sigma2_hat)
        self.D_hat_history.append(D_hat)

        # Generalized Log Likelihood computation to check convergence
        gll = 0
        for cluster_id in cluster_counts.index:
            # Get cached cluster slices
            indices_i = indices_by_cluster[cluster_id]
            y_i = y_by_cluster[cluster_id]
            Z_i = Z_by_cluster[cluster_id]
            I_i = I_by_cluster[cluster_id]

            # Slice f_hat and get b_hat
            f_hat_i = f_hat[indices_i]
            R_hat_i = sigma2_hat * I_i
            b_hat_i = b_hat_df.loc[cluster_id]

            # Numerically stable way of computing log(det(A))
            _, logdet_D_hat = np.linalg.slogdet(D_hat)
            _, logdet_R_hat_i = np.linalg.slogdet(R_hat_i)

            gll += ((y_i - f_hat_i - Z_i.dot(b_hat_i)).T.dot(
                np.linalg.pinv(R_hat_i)).dot(
                    y_i - f_hat_i - Z_i.dot(b_hat_i)) +
                b_hat_i.T.dot(np.linalg.pinv(D_hat)).dot(b_hat_i) +
                logdet_D_hat + logdet_R_hat_i)

        logger.info("GLL is {} at iteration {}.".format(gll, iteration))
        self.gll_history.append(gll)

        # Early Stopping. This code is entered only if the early stop
        # threshold is specified and the gll_history array is longer than
        # 1 element, i.e. we are past the first iteration.
        if (self.gll_early_stop_threshold is not None and
                len(self.gll_history) > 1):
            curr_threshold = np.abs(
                (gll - self.gll_history[-2]) / self.gll_history[-2])
            logger.debug("stop threshold = {}".format(curr_threshold))

            if curr_threshold < self.gll_early_stop_threshold:
                logger.info("GLL {} less than threshold {}, "
                            "stopping early ...".format(gll, curr_threshold))
                early_stop_flag = True

    # Store off the trained fixed effects model and b_hat as the model to
    # be used in the prediction stage
    self.cluster_counts = cluster_counts
    self.trained_fe_model = self.fe_model
    self.trained_b = b_hat_df

    self.b_hat_history_df = self._convert_bhat_history(self.b_hat_history)

    return self
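# A hedged sketch of how a fit() like the one above might be driven.
# Only fit()'s signature comes from the code above; the MERF class name,
# its import path, and its constructor arguments are assumptions modeled
# loosely on the merf package, not taken from this snippet.
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from merf import MERF  # assumed import

rng = np.random.default_rng(0)
n = 200
X = rng.normal(size=(n, 3))                       # fixed-effect covariates
Z = np.ones((n, 1))                               # random-intercept design
clusters = pd.Series(rng.integers(0, 5, size=n))  # cluster id per sample
cluster_offsets = {0: -1.0, 1: 0.0, 2: 1.0, 3: 2.0, 4: 3.0}
y = (X[:, 0] + clusters.map(cluster_offsets).to_numpy()
     + rng.normal(scale=0.1, size=n))

model = MERF(fixed_effects_model=RandomForestRegressor(n_estimators=100),
             max_iterations=5)
model.fit(X, Z, clusters, y)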
def main():
    """
    Calculation and aggregation of summary statistics
    """
    # Summary statistics
    # return value is not an ndarray
    df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                    [np.nan, np.nan], [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print(df)
    print(df.sum())
    print(df.sum(axis=1))
    print(df.mean(axis=1))  # excludes nan
    print(df.mean(axis=1, skipna=False))
    print(df.idxmin())
    print(df.idxmax())
    print(df.cumsum())
    print(df.describe())

    # values are not numbers
    obj = Series(list('aabc') * 4)
    print(obj.describe())

    methods = ['count', 'min', 'max',
               # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std',
               'skew', 'kurt', 'cummin', 'cummax', 'cumprod',
               'diff', 'pct_change']
    for method in methods:
        print('「{0}」'.format(method))
        print(getattr(df, method)())
        print('')

    # Correlation and covariance
    all_data = {}
    lst = []  # ['AAPL', 'IBM', 'MSFT']  # , 'GOOG']:
    for ticket in lst:
        # IOError: after 3 tries, Yahoo! did not return a 200 for url
        # 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(
            ticket, '1/1/2000', '1/1/2010')
    price = DataFrame({tic: data['Adj Close']
                       for tic, data in all_data.items()})
    volume = DataFrame({tic: data['Volume']
                        for tic, data in all_data.items()})
    if all_data:
        returns = price.pct_change()
        print(returns.tail())
        print('')
        print(returns.MSFT.corr(returns.IBM))
        print(returns.MSFT.cov(returns.IBM))
        print('')
        print(returns.corr())
        print(returns.cov())
        print('')
        print(returns.corrwith(returns.IBM))
        print(returns.corrwith(volume))

    # unique values, frequencies, membership
    print('', '')
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print(uniques)
    print(obj.value_counts())
    print(pd.value_counts(obj.values, sort=False))
    mask = obj.isin(['b', 'c'])
    print(mask)
    print(obj[mask])

    data = DataFrame({
        'Qu1': [1, 3, 4, 3, 4],
        'Qu2': [2, 3, 1, 2, 3],
        'Qu3': [1, 5, 2, 4, 4],
    })
    print(data)
    print(data.apply(pd.value_counts).fillna(0))
print(frame2.values[0, 2])
print()

# frame3 = frame2.drop('d')
frame3 = frame2.drop('d', axis=0)    # drop row 'd'
print(frame3)
frame4 = frame2.drop('tel', axis=1)  # drop the 'tel' column
print(frame4)
print()

print(frame3.sort_index(axis=0, ascending=False))  # sort by row labels; ascending=False gives descending order
print(frame3.sort_index(axis=1, ascending=False))  # sort by column labels
print(frame3.rank(axis=0))
print()
print(frame3['juso'].value_counts())  # group counts by address
print()

data = {'juso': ['강남구 역삼동', '중구 신당동', '강남구 대치동'],
        'inwon': [23, 25, 15]}
frame = DataFrame(data)
print(frame)
result1 = Series([x.split()[0] for x in frame.juso])  # take the district ('gu') part of juso
result2 = Series((x.split()[0] for x in frame.juso))  # same, via a generator expression
print(result1)
print(result2)
print(result2.value_counts())  # counts per district
# missing values
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
print(tz_counts[:10])

# plot it
import matplotlib
tz_counts[:10].plot(kind='bar', rot=0)

print(frame['a'][1])

# user agents
agents = Series([x.split()[0] for x in frame.a.dropna()])
print(agents[:5])
print(agents.value_counts()[:8])

# OS
import numpy as np
cframe = frame[frame.a.notnull()]
oses = np.where(cframe['a'].str.contains('Windows'),
                'Windows', 'Non-Windows')
print(oses[:5])

# grouping
by_tz_os = cframe.groupby(['tz', oses])
agg_counts = by_tz_os.size().unstack().fillna(0)
print(agg_counts[:10])

# top overall time zones
indexer = agg_counts.sum(1).argsort()
print(indexer[:10])
returns.IBM.head()

returns.ix[:, 2].corr(returns.ix[:, 3])
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)
# returns.corrwith(volume)  # matches on column names

# In[241]:

# unique values, value counts, and membership
obj = Series(['c', 'a', 'd ', 'a', 'a', 'b', 'b', 'c', 'c'])
obj.unique()
obj.value_counts(sort=False)
obj.isin(['b', 'c'])

# In[584]:

import pandas as pd
data = DataFrame({'qu1': [1, 3, 4, 3, 4],
                  'qu2': [2, 3, 2, 2, 3],
                  'qu3': [1, 5, 2, 4, 5]})
result = data.apply(pd.value_counts).fillna(0)
result

# In[594]:
def fit(self, values: pd.Series):
    self.levels = values.dropna().unique()
    self.encoder = values.value_counts().to_dict()
def fit_transform(self, values: pd.Series):
    self.levels = values.unique()
    self.encoder = values.value_counts().to_dict()
    return values.map(self.encoder)
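# The two methods above appear to belong to a count/frequency encoder.
# Below is a minimal self-contained sketch of such a class; the name
# CountEncoder and the separate transform() method are assumptions added
# for illustration, not taken from the original source.
import pandas as pd

class CountEncoder:
    """Hypothetical frequency encoder: replaces each category with its
    count in the training data (mirroring the fit/fit_transform above)."""

    def __init__(self):
        self.levels = None
        self.encoder = None

    def fit(self, values: pd.Series):
        self.levels = values.dropna().unique()
        self.encoder = values.value_counts().to_dict()
        return self

    def transform(self, values: pd.Series) -> pd.Series:
        # unseen categories map to NaN, the Series.map default
        return values.map(self.encoder)

enc = CountEncoder().fit(pd.Series(['a', 'b', 'b', 'c', 'b', 'a']))
print(enc.transform(pd.Series(['a', 'b', 'z'])))
# 0    2.0
# 1    3.0
# 2    NaN   <- 'z' was never seen during fit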
def test_float64index_slicing_bug(self):
    # GH 5557, related to slicing a float index
    ser = {256: 2321.0, 1: 78.0, 2: 2716.0, 3: 0.0, 4: 369.0, 5: 0.0,
           6: 269.0, 7: 0.0, 8: 0.0, 9: 0.0, 10: 3536.0, 11: 0.0,
           12: 24.0, 13: 0.0, 14: 931.0, 15: 0.0, 16: 101.0, 17: 78.0,
           18: 9643.0, 19: 0.0, 20: 0.0, 21: 0.0, 22: 63761.0, 23: 0.0,
           24: 446.0, 25: 0.0, 26: 34773.0, 27: 0.0, 28: 729.0,
           29: 78.0, 30: 0.0, 31: 0.0, 32: 3374.0, 33: 0.0, 34: 1391.0,
           35: 0.0, 36: 361.0, 37: 0.0, 38: 61808.0, 39: 0.0, 40: 0.0,
           41: 0.0, 42: 6677.0, 43: 0.0, 44: 802.0, 45: 0.0, 46: 2691.0,
           47: 0.0, 48: 3582.0, 49: 0.0, 50: 734.0, 51: 0.0, 52: 627.0,
           53: 70.0, 54: 2584.0, 55: 0.0, 56: 324.0, 57: 0.0, 58: 605.0,
           59: 0.0, 60: 0.0, 61: 0.0, 62: 3989.0, 63: 10.0, 64: 42.0,
           65: 0.0, 66: 904.0, 67: 0.0, 68: 88.0, 69: 70.0, 70: 8172.0,
           71: 0.0, 72: 0.0, 73: 0.0, 74: 64902.0, 75: 0.0, 76: 347.0,
           77: 0.0, 78: 36605.0, 79: 0.0, 80: 379.0, 81: 70.0, 82: 0.0,
           83: 0.0, 84: 3001.0, 85: 0.0, 86: 1630.0, 87: 7.0, 88: 364.0,
           89: 0.0, 90: 67404.0, 91: 9.0, 92: 0.0, 93: 0.0, 94: 7685.0,
           95: 0.0, 96: 1017.0, 97: 0.0, 98: 2831.0, 99: 0.0,
           100: 2963.0, 101: 0.0, 102: 854.0, 103: 0.0, 104: 0.0,
           105: 0.0, 106: 0.0, 107: 0.0, 108: 0.0, 109: 0.0, 110: 0.0,
           111: 0.0, 112: 0.0, 113: 0.0, 114: 0.0, 115: 0.0, 116: 0.0,
           117: 0.0, 118: 0.0, 119: 0.0, 120: 0.0, 121: 0.0, 122: 0.0,
           123: 0.0, 124: 0.0, 125: 0.0, 126: 67744.0, 127: 22.0,
           128: 264.0, 129: 0.0, 260: 197.0, 268: 0.0, 265: 0.0,
           269: 0.0, 261: 0.0, 266: 1198.0, 267: 0.0, 262: 2629.0,
           258: 775.0, 257: 0.0, 263: 0.0, 259: 0.0, 264: 163.0,
           250: 10326.0, 251: 0.0, 252: 1228.0, 253: 0.0, 254: 2769.0,
           255: 0.0}

    # smoke test for the repr
    s = Series(ser)
    result = s.value_counts()
    str(result)
# __Plot of tz_counts__________________________
tz_counts[:10].plot(kind='barh', rot=0)
import matplotlib.pyplot as plt
# plt.show()

# __Counting elements__________________________
frame['a'][1]
frame['a'][50]
frame['a'][51]
results = Series([x.split()[0] for x in frame.a.dropna()])
# .dropna() is a pandas method that removes missing rows
#   (arguments select which rows to drop).
# str.split(x) splits str on the separator x into a list.
# Here the whitespace-separated tokens are collected via a list
# comprehension and wrapped in a pandas Series.
results[:5]
results.value_counts()[:8]  # value_counts() counts identical elements

## __Counting elements (alternative)__________________________
cframe = frame[frame.a.notnull()]
# Collects only the non-null entries of frame's column 'a'
#   (cframe['a'] == frame.a.dropna())
bool(map(list, [cframe['a'], frame.a.dropna()]))
# Applies list() to cframe['a'] and frame.a.dropna() to check they match

# __'Windows' or Not?__________________________
import numpy as np
operating_system = np.where(cframe['a'].str.contains('Windows'),
                            'Windows', 'Not Windows')
# Where cframe['a'] contains the string 'Windows': True -> 'Windows',
# False -> 'Not Windows'. Equivalent to:
#   ['Windows' if 'Windows' in x else 'Not Windows' for x in cframe['a']]
operating_system[:5]
def get_overexpressed_genes(matrix: ExpMatrix, cell_labels: pd.Series,
                            exp_thresh: float = 0.05,
                            ignore_outliers: bool = True,
                            num_genes: int = 20) -> pd.DataFrame:
    """Determine most over-expressed genes for each cluster."""
    # make sure matrix and cell_labels are aligned
    matrix = matrix.loc[:, cell_labels.index]

    if ignore_outliers:
        # ignore the cluster named "Outliers", if it exists
        sel = (cell_labels != 'Outliers')
        matrix = matrix.loc[:, sel]
        cell_labels = cell_labels.loc[sel]

    _LOGGER.info('Ignoring mean expression values below %.3f', exp_thresh)

    data = []

    # scale matrix
    matrix = matrix.scale()

    # determine fold-changes for all clusters
    vc = cell_labels.value_counts()
    clusters = vc.index.tolist()

    X = np.zeros((len(clusters), matrix.num_genes), dtype=np.float32)
    cluster_mean = ExpMatrix(genes=matrix.genes, cells=clusters, data=X.T)
    for l in clusters:
        sel = (cell_labels == l)
        cluster_mean.loc[:, l] = matrix.loc[:, sel].mean(axis=1)

    # in the calculation of fold change,
    # ignore all expression values below exp_thresh
    thresh_cluster_mean = cluster_mean.copy()
    thresh_cluster_mean[thresh_cluster_mean < exp_thresh] = exp_thresh

    # calculate fold change relative to the average of the other clusters
    X = np.ones((len(clusters), matrix.num_genes), dtype=np.float32)
    fold_change = ExpMatrix(genes=matrix.genes, cells=clusters, data=X.T)
    for l in clusters:
        sel = (thresh_cluster_mean.cells != l)
        fold_change.loc[:, l] = thresh_cluster_mean.loc[:, l] / \
            (thresh_cluster_mean.loc[:, sel].mean(axis=1))

    markers = []
    for l in clusters:
        change = fold_change.loc[:, l].sort_values(ascending=False)
        change = change[:num_genes]

        # scale mean expression values to 10K transcripts
        mean = cluster_mean.loc[change.index, l]
        mean = (10000 / cluster_mean.loc[:, l].sum()) * mean

        cluster_index = [l] * num_genes
        gene_index = change.index
        index = pd.MultiIndex.from_arrays(
            [cluster_index, gene_index], names=['cluster', 'gene'])

        data = np.c_[change.values, mean.values]
        markers.append(
            pd.DataFrame(index=index,
                         columns=['Fold change',
                                  'Mean expression (TP10K)'],
                         data=data))

    markers = pd.concat(markers, axis=0)
    # markers = markers.swaplevel(0, 1).sort_index(
    #     level=1, sort_remaining=False).swaplevel(0, 1)

    return markers
}
f = DataFrame(dic, index=np.arange(100, 80, -1))
f2 = DataFrame({'line03': np.linspace(30, 35, 10),
                'line04': np.arange(10)},
               index=np.arange(100, 90, -1))
f3 = f.add(f2)  # element-wise sum

# sum of each column
sum = f.sum()
# sum of specific columns
sum1 = f[['line01', 'line02']].sum()
# sum of each row
sum2 = f.sum(axis=1)
# print(sum)
# print(sum1)
# print(sum2)

# If a row or column contains NaN the result is NaN.
# skipna defaults to True, which ignores NaN.
sum4 = f3.sum(skipna=False)
# print(sum4)

# get all unique values
S2 = Series(['c', 'd', 'a', 'c', 'c', 'c', 'r', 'a', 'd'])
uniques = S2.unique()
# print(uniques)

# how many times each value occurs
uniques_counts = S2.value_counts()
# print(uniques_counts)

# per-column value counts across several DataFrame columns
result = f.apply(pd.value_counts).fillna(0)
print(result)
# Create dataframe of records
# Tabular 2-by-2 spreadsheet table
frame = DataFrame(records)

# Get histogram (counts) easily
tz_counts = frame['tz'].value_counts()
# Check output
# print tz_counts[:10]

# Data Munging (Clean the data)
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
# Check output
# print clean_tz.value_counts()[:10]

# Getting a plot
import matplotlib.pyplot as plt
tz_counts[:10].plot(kind='barh', rot=0)
# show all plots
# plt.show()

# PARSING DATA
# Example of data below
# u'GoogleMaps/RochesterNY'
# u'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
# u'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML,
# nan

# frame.a --> accesses key a of dataframe
# .dropna() --> ignores values where nan
# .split() --> results in a list of the values split by whitespace
results = Series([x.split()[0] for x in frame.a.dropna()])
# print results[:10]
print(results.value_counts()[:8])
countingsort(randint(0, 100, 100))

# # Code segments

# In[457]:

arr = [2, 5, 3, 0, 2, 3, 0, 3]
arr

# In[458]:

hist = Series(arr)
hist
sorted_hist = hist.value_counts().sort_index()

# In[459]:

sorted_hist

# In[460]:

cleaned_hist = Series(sorted_hist, index=range(max(sorted_hist.index) + 1))
cleaned_hist

# In[461]:
def is_categorical_column(
    data: pd.Series,
    valid_data: pd.Series,
    threshold: int = None,
    ratio: Optional[float] = None,
    oov_ratio_threshold: Optional[float] = None,
    is_label: bool = False,
) -> bool:
    """
    Identify whether a column is a categorical column.
    If the number of unique elements in the column is smaller than
    min(#Total Sample * ratio, threshold), it will be treated as a
    categorical column.

    Parameters
    ----------
    data
        One column of a multimodal pd.DataFrame for training.
    valid_data
        One column of a multimodal pd.DataFrame for validation.
    threshold
        The threshold for detecting a categorical column.
    ratio
        The ratio for detecting a categorical column.
    oov_ratio_threshold
        The out-of-vocabulary ratio between training and validation.
        This is used to determine if the column is a categorical column.
        Usually, a categorical column can tolerate a small OOV ratio.
    is_label
        Whether the column is a label column.

    Returns
    -------
    Whether the column is a categorical column.
    """
    if data.dtype.name == "category":
        return True
    else:
        if threshold is None:
            if is_label:
                threshold = 100
                oov_ratio_threshold = 0
                ratio = 0.1
            else:
                threshold = 20
                oov_ratio_threshold = 0
                ratio = 0.1
        threshold = min(int(len(data) * ratio), threshold)
        data_value_counts = data.value_counts(dropna=False)
        key_set = set(data_value_counts.keys())
        if len(data_value_counts) < threshold:
            valid_value_counts = valid_data.value_counts(dropna=False)
            total_valid_num = len(valid_data)
            # count out-of-vocabulary occurrences in the validation column
            oov_num = 0
            for k, v in zip(valid_value_counts.keys(),
                            valid_value_counts.values):
                if k not in key_set:
                    oov_num += v
            if is_label and oov_num != 0:
                return False
            if oov_num / total_valid_num > oov_ratio_threshold:
                return False
            return True
        return False
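# A small usage sketch of the heuristic above on toy data; the column
# values below are invented for illustration only:
import pandas as pd

train_col = pd.Series(['red', 'blue', 'red', 'green'] * 50)  # 3 uniques, 200 rows
valid_col = pd.Series(['red', 'blue', 'green'] * 10)

# Few unique values and no out-of-vocabulary categories -> categorical
print(is_categorical_column(train_col, valid_col))    # True

# 200 distinct string values in 200 rows exceed the threshold -> not categorical
id_col = pd.Series(range(200)).astype(str)
print(is_categorical_column(id_col, valid_col))       # False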
def countingsort(arr):
    # Convert to a pandas Series to build the histogram
    hist = Series(arr)
    sns.distplot(hist, kde=False, rug=True, color='royalblue',
                 bins=max(hist.index) * 3, label=r'Frequency')
    plt.ylim(0, max(hist.index) + 1)
    plt.xlabel('Element')
    plt.ylabel(r'Frequency')
    plt.title(r'$\mathrm{Array\ Histogram}$')
    plt.show()

    # Sorted histogram; missing values (NaN) are replaced with 0
    sorted_hist = hist.value_counts().sort_index()
    cleaned_hist = Series(sorted_hist,
                          index=range(max(sorted_hist.index) + 1)).fillna(0)

    # Running (cumulative) sums of the histogram values
    summed_hist = Series(cleaned_hist.cumsum()[:-1].values,
                         index=range(1, max(arr) + 1))

    # Clean up the summed histogram
    summed_hist_cleaned = Series(summed_hist,
                                 index=range(max(arr) + 1)).fillna(0)

    # Build DataFrames for A, B and the helper array C
    # DataFrame A
    rows = len(arr)  # number of rows
    columns_A = []   # column labels

    # Name the columns of A
    for num in range(rows):
        columns_A.append('A[' + str(num) + ']')

    # Create the DataFrame
    dframe_A = DataFrame(np.array(list(arr) * rows).reshape(rows, rows),
                         columns=columns_A, index=range(rows))

    # Now the same for the helper array C
    hilfs_array = np.array(summed_hist_cleaned.values)
    columns = len(hilfs_array)
    columns_C = []
    for num in range(columns):
        columns_C.append('C[' + str(num) + ']')
    dframe_C = DataFrame(np.array(list(hilfs_array) * rows)
                         .reshape(rows, columns),
                         index=range(rows), columns=columns_C)

    # First create an EMPTY DataFrame B
    columns_B = []
    for num in range(rows):
        columns_B.append('B[' + str(num) + ']')
    dframe_B = DataFrame(np.nan, index=range(rows),
                         columns=columns_B).fillna(' ')

    # Build a dict holding the keys and values for the finally sorted
    # array B
    b = {}
    lookup_value = 0
    for i in range(rows):
        # Advance the values in C once A has been looked up
        if i > 0:
            dframe_C['C[' + str(lookup_value) + ']'][i:] += 1
        # Value looked up in C, to be inserted into B at position C[A[i]]
        lookup_value = dframe_A.values[i][i]
        key = 'B[' + str(int(dframe_C.values[i][lookup_value])) + ']'
        b[key] = [lookup_value, i]

    # Finally insert the values into B in sorted order
    for key, value in b.items():
        dframe_B[key][value[1]:] = value[0]

    # Concatenate the three DataFrames
    final_dframe = pd.concat([dframe_A, dframe_C, dframe_B], axis=1)

    # The sorted array B
    result = []
    for i in range(len(b)):
        result.append(b['B[' + str(i) + ']'][0])

    # print('\nDataFrame A\n')
    # display(dframe_A)
    # print('\nDataFrame C\n')
    # display(dframe_C)
    # print('\nDataFrame B\n')
    # display(dframe_B)
    # display(final_dframe)

    return result
def test_value_counts_bins(self, klass):
    s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
    s = klass(s_values)

    # bins
    with pytest.raises(TypeError):
        s.value_counts(bins=1)

    s1 = Series([1, 1, 2, 3])
    res1 = s1.value_counts(bins=1)
    exp1 = Series({Interval(0.997, 3.0): 4})
    tm.assert_series_equal(res1, exp1)

    res1n = s1.value_counts(bins=1, normalize=True)
    exp1n = Series({Interval(0.997, 3.0): 1.0})
    tm.assert_series_equal(res1n, exp1n)

    if isinstance(s1, Index):
        tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
    else:
        exp = np.array([1, 2, 3], dtype=np.int64)
        tm.assert_numpy_array_equal(s1.unique(), exp)

    assert s1.nunique() == 3

    # these return the same
    res4 = s1.value_counts(bins=4, dropna=True)
    intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
    exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
    tm.assert_series_equal(res4, exp4)

    res4 = s1.value_counts(bins=4, dropna=False)
    intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
    exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
    tm.assert_series_equal(res4, exp4)

    res4n = s1.value_counts(bins=4, normalize=True)
    exp4n = Series([0.5, 0.25, 0.25, 0],
                   index=intervals.take([0, 3, 1, 2]))
    tm.assert_series_equal(res4n, exp4n)

    # handle NA's properly
    s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
                'd', 'd', 'a', 'a', 'b']
    s = klass(s_values)
    expected = Series([4, 3, 2], index=['b', 'a', 'd'])
    tm.assert_series_equal(s.value_counts(), expected)

    if isinstance(s, Index):
        exp = Index(['a', 'b', np.nan, 'd'])
        tm.assert_index_equal(s.unique(), exp)
    else:
        exp = np.array(['a', 'b', np.nan, 'd'], dtype=object)
        tm.assert_numpy_array_equal(s.unique(), exp)
    assert s.nunique() == 3

    s = klass({})
    expected = Series([], dtype=np.int64)
    tm.assert_series_equal(s.value_counts(), expected,
                           check_index_type=False)
    # returned dtype differs depending on original
    if isinstance(s, Index):
        tm.assert_index_equal(s.unique(), Index([]), exact=False)
    else:
        tm.assert_numpy_array_equal(s.unique(), np.array([]),
                                    check_dtype=False)
    assert s.nunique() == 0
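# The binned behavior this test pins down can be reproduced interactively.
# A small sketch; the tie order among equal counts and the exact result
# formatting vary across pandas versions:
import pandas as pd

s1 = pd.Series([1, 1, 2, 3])
print(s1.value_counts(bins=4))
# (0.997, 1.5]    2
# (2.5, 3.0]      1
# (1.5, 2.0]      1
# (2.0, 2.5]      0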
# print cy_counts[:20]
# print l_counts[:20]

clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'TZ Unknown'
tz_counts = clean_tz.value_counts()
print(tz_counts[:10])

tz_counts[:10].plot(kind='barh', rot=0)

results = Series([x.split()[0] for x in frame.a.dropna()])
print(results.value_counts()[:12])

cframe = frame[frame.a.notnull()]
operating_system = np.where(cframe['a'].str.contains('Windows'),
                            'Windows', 'Not Windows')
print(operating_system[:10])

by_tz_os = cframe.groupby(['tz', operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)
print(agg_counts[:10])

indexer = agg_counts.sum(1).argsort()
print(agg_counts[:20])
def test_value_counts_inferred(self):
    klasses = [Index, Series]
    for klass in klasses:
        s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
        s = klass(s_values)
        expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
        tm.assert_series_equal(s.value_counts(), expected)

        self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
        self.assertEqual(s.nunique(), 4)

        # don't sort; have to sort after the fact, as not sorting is
        # platform-dependent
        hist = s.value_counts(sort=False)
        hist.sort()
        expected = Series([3, 1, 4, 2], index=list('acbd'))
        expected.sort()
        tm.assert_series_equal(hist, expected)

        # sort ascending
        hist = s.value_counts(ascending=True)
        expected = Series([1, 2, 3, 4], index=list('cdab'))
        tm.assert_series_equal(hist, expected)

        # relative histogram
        hist = s.value_counts(normalize=True)
        expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
        tm.assert_series_equal(hist, expected)

        # bins
        self.assertRaises(TypeError,
                          lambda bins: s.value_counts(bins=bins), 1)

        s1 = Series([1, 1, 2, 3])
        res1 = s1.value_counts(bins=1)
        exp1 = Series({0.998: 4})
        tm.assert_series_equal(res1, exp1)
        res1n = s1.value_counts(bins=1, normalize=True)
        exp1n = Series({0.998: 1.0})
        tm.assert_series_equal(res1n, exp1n)

        self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
        self.assertEqual(s1.nunique(), 3)

        res4 = s1.value_counts(bins=4)
        exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1},
                      index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4, exp4)
        res4n = s1.value_counts(bins=4, normalize=True)
        exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25},
                       index=[0.998, 2.5, 1.5, 2.0])
        tm.assert_series_equal(res4n, exp4n)

        # handle NA's properly
        s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
                    'd', 'd', 'a', 'a', 'b']
        s = klass(s_values)
        expected = Series([4, 3, 2], index=['b', 'a', 'd'])
        tm.assert_series_equal(s.value_counts(), expected)

        self.assert_numpy_array_equal(
            s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
        self.assertEqual(s.nunique(), 3)

        s = klass({})
        expected = Series([], dtype=np.int64)
        tm.assert_series_equal(s.value_counts(), expected)
        self.assert_numpy_array_equal(s.unique(), np.array([]))
        self.assertEqual(s.nunique(), 0)

        # GH 3002, datetime64[ns]
        txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM',
                         'xxyyzz20100101EGG', 'xxyyww20090101EGG',
                         'foofoo20080909PIE', 'foofoo20080909GUM'])
        f = StringIO(txt)
        df = pd.read_fwf(f, widths=[6, 8, 3],
                         names=["person_id", "dt", "food"],
                         parse_dates=["dt"])

        s = klass(df['dt'].copy())
        idx = pd.to_datetime(['2010-01-01 00:00:00Z',
                              '2008-09-09 00:00:00Z',
                              '2009-01-01 00:00:00Z'])
        expected_s = Series([3, 2, 1], index=idx)
        tm.assert_series_equal(s.value_counts(), expected_s)

        expected = np.array(['2010-01-01 00:00:00Z',
                             '2009-01-01 00:00:00Z',
                             '2008-09-09 00:00:00Z'],
                            dtype='datetime64[ns]')
        if isinstance(s, DatetimeIndex):
            expected = DatetimeIndex(expected)
            self.assertTrue(s.unique().equals(expected))
        else:
            self.assert_numpy_array_equal(s.unique(), expected)

        self.assertEqual(s.nunique(), 3)

        # with NaT
        s = df['dt'].copy()
        s = klass([v for v in s.values] + [pd.NaT])

        result = s.value_counts()
        self.assertEqual(result.index.dtype, 'datetime64[ns]')
        expected_s[pd.NaT] = 1
        tm.assert_series_equal(result, expected_s)

        unique = s.unique()
        self.assertEqual(unique.dtype, 'datetime64[ns]')
        # numpy_array_equal cannot compare pd.NaT
        self.assert_numpy_array_equal(unique[:3], expected)
        self.assertTrue(unique[3] is pd.NaT or
                        unique[3].astype('int64') == pd.tslib.iNaT)

        self.assertEqual(s.nunique(), 4)

        # timedelta64[ns]
        td = df.dt - df.dt + timedelta(1)
        td = klass(td)

        result = td.value_counts()
        expected_s = Series([6], index=[86400000000000])
        self.assertEqual(result.index.dtype, 'int64')
        tm.assert_series_equal(result, expected_s)

        # get nanoseconds to compare
        expected = np.array([86400000000000])
        self.assert_numpy_array_equal(td.unique(), expected)
        self.assertEqual(td.nunique(), 1)

        td2 = timedelta(1) + (df.dt - df.dt)
        td2 = klass(td2)
        result2 = td2.value_counts()
        self.assertEqual(result2.index.dtype, 'int64')
        tm.assert_series_equal(result2, expected_s)
        self.assert_numpy_array_equal(td.unique(), expected)
        self.assertEqual(td.nunique(), 1)
from pandas import DataFrame, Series
import pandas as pd
import numpy as np

frame = DataFrame(records)
frame

# Missing fields -> replace with the string 'Missing' via fillna().
# Present but empty strings -> replace with 'Unknown' using boolean indexing.
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10]

# Extract and show the first token of each user agent
results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]
results.value_counts()[:8]  # show the top 8 counts

# Classify Windows vs. non-Windows users.
# Criterion: the UA string contains 'Windows'.
# Starting from frame, drop the records without a UA first.
cframe = frame[frame.a.notnull()]
# use numpy.where()
operating_system = np.where(cframe['a'].str.contains('Windows'),
                            'Windows', 'Not Windows')
operating_system[:5]

# Group by combinations of time zone and operating system,
# using pandas.DataFrame.unstack()
by_tz_os = cframe.groupby(['tz', operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)
# sort in ascending order
def test_value_counts_bins(self):
    klasses = [Index, Series]
    for klass in klasses:
        s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
        s = klass(s_values)

        # bins
        pytest.raises(TypeError,
                      lambda bins: s.value_counts(bins=bins), 1)

        s1 = Series([1, 1, 2, 3])
        res1 = s1.value_counts(bins=1)
        exp1 = Series({Interval(0.997, 3.0): 4})
        tm.assert_series_equal(res1, exp1)

        res1n = s1.value_counts(bins=1, normalize=True)
        exp1n = Series({Interval(0.997, 3.0): 1.0})
        tm.assert_series_equal(res1n, exp1n)

        if isinstance(s1, Index):
            tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
        else:
            exp = np.array([1, 2, 3], dtype=np.int64)
            tm.assert_numpy_array_equal(s1.unique(), exp)
        assert s1.nunique() == 3

        # these return the same
        res4 = s1.value_counts(bins=4, dropna=True)
        intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
        exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
        tm.assert_series_equal(res4, exp4)

        res4 = s1.value_counts(bins=4, dropna=False)
        intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
        exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
        tm.assert_series_equal(res4, exp4)

        res4n = s1.value_counts(bins=4, normalize=True)
        exp4n = Series([0.5, 0.25, 0.25, 0],
                       index=intervals.take([0, 3, 1, 2]))
        tm.assert_series_equal(res4n, exp4n)

        # handle NA's properly
        s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
                    'd', 'd', 'a', 'a', 'b']
        s = klass(s_values)
        expected = Series([4, 3, 2], index=['b', 'a', 'd'])
        tm.assert_series_equal(s.value_counts(), expected)

        if isinstance(s, Index):
            exp = Index(['a', 'b', np.nan, 'd'])
            tm.assert_index_equal(s.unique(), exp)
        else:
            exp = np.array(['a', 'b', np.nan, 'd'], dtype=object)
            tm.assert_numpy_array_equal(s.unique(), exp)
        assert s.nunique() == 3

        s = klass({})
        expected = Series([], dtype=np.int64)
        tm.assert_series_equal(s.value_counts(), expected,
                               check_index_type=False)
        # returned dtype differs depending on original
        if isinstance(s, Index):
            tm.assert_index_equal(s.unique(), Index([]), exact=False)
        else:
            tm.assert_numpy_array_equal(s.unique(), np.array([]),
                                        check_dtype=False)
        assert s.nunique() == 0
iris.target

# In[178]:

len(iris.target)

# In[179]:

Y = Series(iris.target)

# In[180]:

Y.value_counts()

##### Clearly there are 3 groups of dependent variable values.
##### Now we will try to depict these values graphically.

# In[181]:

iris.data.shape

##### For each value of the dependent variable in Y, we have 4 independent
##### variables; we will use 2 of them and plot the points.

# In[182]:

X = DataFrame(iris.data[:, 0:2])
# TODO: Use the interquartile range to calculate an outlier step
# (1.5 times the interquartile range)
step = 1.5 * (Q3 - Q1)
print("step = %1.2f" % step)

# Display the outliers
print("Data points considered outliers for the feature '{}':".format(feature))
df_outlier = log_data[~((log_data[feature] >= Q1 - step) &
                        (log_data[feature] <= Q3 + step))]
# display(df_outlier)  # switching this off for better graphical output
print("Switching this off for better graphical output\n")

# OPTIONAL: Select the indices for data points you wish to remove
for i in df_outlier.index:
    out_liers.append(i)

s = Series(out_liers)    # convert outliers into a Series object
s_vc = s.value_counts()  # use value_counts to group repeated outlier indices
# keep all indices that are flagged for at most 1 outlier feature
valid = [i for i in range(log_data.shape[0]) if i not in s_vc[s_vc > 1]]

# Remove the outliers, if any were specified
# good_data = log_data.drop(log_data.index[outliers]).reset_index(drop=True)
good_data = log_data.ix[valid, :]

print("Potential outliers = ", len(s_vc))
print("'True' outliers")
print(s_vc[s_vc > 1])
print("Original data = ", log_data.shape[0])
print("Data without outliers = ", good_data.shape[0])
    'The Man with the Twisted Lip',
    'The Adventure of the Blue Carbuncle',
    'The Adventure of the Speckled Band',
    "The Adventure of the Engineer's Thumb",
    'The Adventure of the Noble Bachelor',
    'The Adventure of the Beryl Coronet',
    'The Adventure of the Copper Beeches'
]

titles = [title.lower() for title in titles]

shortStoryCounts = []
for i, title in enumerate(titles):
    shortStory = sherlockTexts[title]
    tokenizedStory = nltk.word_tokenize(shortStory)
    tokenizedStory = [word for word in tokenizedStory if word.isalnum()]
    tokenSeries = Series(tokenizedStory)
    shortStoryCounts.append(tokenSeries.value_counts())

df = pd.concat(shortStoryCounts, axis=1, sort=False)
dtm = df.T

# First let's get the term frequencies. These are just the raw counts in
# the dtm divided by the length of the document.
documentLengths = dtm.sum(axis=1)  # add up the word count for all the words
frequencyDtm = dtm.div(documentLengths, axis='index')

# Replace NaN values with 0 (otherwise the math won't work)
frequencyDtm = frequencyDtm.fillna(0)

# Get a Series which tells you how many documents contain each term
docsWithTerm = dtm.count()

# Get the weight of the term (total number of documents divided by the
# number of documents containing the term)
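# The snippet breaks off at the inverse document frequency step. A hedged
# sketch of how that computation typically continues; the log-scaled IDF
# and the variable names below are assumptions, not from the original:
import numpy as np

# idf = log(N / df_t): N documents in total, df_t documents containing
# term t (docsWithTerm, computed above).
idf = np.log(len(dtm) / docsWithTerm)

# Weight the term frequencies to obtain a TF-IDF matrix.
tfidfDtm = frequencyDtm * idf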