import string

from numpy import nan
from pandas import Series


def wordify(abs_list, min_word_len=2):
    '''
    Convert the abstract field from PLoS API data to a filtered list of words.
    '''
    # The abstract field is a list. Make it a string.
    text = ' '.join(abs_list).strip(' \n\t')
    if text == '':
        return nan
    else:
        # Remove punctuation & replace with space,
        # because we want 'metal-contaminated' => 'metal contaminated'
        # ...not 'metalcontaminated', and so on.
        for c in string.punctuation:
            text = text.replace(c, ' ')
        # Now make it a Series of words, and do some cleaning.
        words = Series(text.split(' '))
        words = words.str.lower()
        # Filter out words less than minimum word length.
        words = words[words.str.len() >= min_word_len]
        # Keep only tokens made entirely of lowercase letters, '#', and '@'
        # (i.e. drop anything containing digits or other characters).
        words = words[~words.str.contains(r'[^#@a-z]')]
        # Filter out globally-defined stopwords (`stops` is assumed to be a
        # module-level set of stopwords).
        ignore = stops & set(words.unique())
        words_out = [w for w in words.tolist() if w not in ignore]
        return words_out
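# Usage sketch for wordify (not from the original source): it relies on the
# module-level `stops` set noted above, so a hypothetical stopword set and a
# made-up abstract are used here purely for illustration.
stops = {'the', 'of', 'in', 'and'}  # hypothetical stopword set

sample_abstract = ["Metal-contaminated soils in the region.", ""]
print(wordify(sample_abstract))
# e.g. ['metal', 'contaminated', 'soils', 'region']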
def pd_01():
    obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'c'])
    uniques = obj.unique()
    print uniques
    uniques.sort()  # ndarray.sort() sorts in place and returns None
    print uniques
    print pd.value_counts(obj, sort=False)
    mask = obj.isin(['b', 'c'])
    print mask
    print obj[mask]
def test_value_counts_bins(self): klasses = [Index, Series] for klass in klasses: s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] s = klass(s_values) # bins self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1) s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) exp1 = Series({0.998: 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) exp1n = Series({0.998: 1.0}) tm.assert_series_equal(res1n, exp1n) self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3], dtype=np.int64)) self.assertEqual(s1.nunique(), 3) res4 = s1.value_counts(bins=4) exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) exp4n = Series( {0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b'] s = klass(s_values) expected = Series([4, 3, 2], index=['b', 'a', 'd']) tm.assert_series_equal(s.value_counts(), expected) exp = np.array(['a', 'b', np.nan, 'd'], dtype=np.object_) self.assert_numpy_array_equal(s.unique(), exp) self.assertEqual(s.nunique(), 3) s = klass({}) expected = Series([], dtype=np.int64) tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) # returned dtype differs depending on original self.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False) self.assertEqual(s.nunique(), 0)
def test_unique(): # GH714 also, dtype=float s = Series([1.2345] * 100) s[::2] = np.nan result = s.unique() assert len(result) == 2 s = Series([1.2345] * 100, dtype='f4') s[::2] = np.nan result = s.unique() assert len(result) == 2 # NAs in object arrays #714 s = Series(['foo'] * 100, dtype='O') s[::2] = np.nan result = s.unique() assert len(result) == 2 # decision about None s = Series([1, 2, 3, None, None, None], dtype=object) result = s.unique() expected = np.array([1, 2, 3, None], dtype=object) tm.assert_numpy_array_equal(result, expected) # GH 18051 s = Series(Categorical([])) tm.assert_categorical_equal(s.unique(), Categorical([]), check_dtype=False) s = Series(Categorical([np.nan])) tm.assert_categorical_equal(s.unique(), Categorical([np.nan]), check_dtype=False)
def _process_target_or_features_for_plotting(target_or_features, type_,
                                             plot_std_max):
    if isinstance(target_or_features, Series):
        is_target = True
    elif isinstance(target_or_features, DataFrame):
        is_target = False
    else:
        raise ValueError(
            'target_or_features ({}) is neither a Series nor a DataFrame.'.format(
                type(target_or_features)))

    if type_ == 'continuous':
        if is_target:
            target_or_features = Series(
                normalize_nd_array(target_or_features.values, '-0-', None,
                                   raise_for_bad_value=False),
                name=target_or_features.name,
                index=target_or_features.index)
        else:
            target_or_features = DataFrame(
                normalize_nd_array(target_or_features.values, '-0-', 1,
                                   raise_for_bad_value=False),
                index=target_or_features.index,
                columns=target_or_features.columns)
        plot_min = max(-plot_std_max, nanmin(target_or_features.values))
        plot_max = min(plot_std_max, nanmax(target_or_features.values))
        colorscale = CONTINUOUS_COLORSCALE_FOR_MATCH
    else:
        plot_min = 0
        if type_ == 'categorical':
            if is_target:
                plot_max = target_or_features.unique().size - 1
            else:
                plot_max = target_or_features.unstack().unique().size - 1
            colorscale = make_colorscale(colors=CATEGORICAL_COLORS)
        elif type_ == 'binary':
            plot_max = 1
            colorscale = make_colorscale(colors=BINARY_COLORS_WHITE_BLACK)
        else:
            raise ValueError('Unknown type_: {}.'.format(type_))

    return target_or_features, plot_min, plot_max, colorscale
def get_problem_type(y: Series):
    """ Identifies which type of prediction problem we are interested in (if
        user has not specified), i.e. binary classification, multi-class
        classification, or regression.
    """
    if len(y) == 0:
        raise ValueError("provided labels cannot have length = 0")
    y = y.dropna()  # Remove missing values from y (there should not be any though as they were removed in Learner.general_data_processing())
    num_rows = len(y)
    unique_values = y.unique()
    unique_count = len(unique_values)
    if unique_count > 10:
        logger.log(20, f'Here are the first 10 unique label values in your data: {list(unique_values[:10])}')
    else:
        logger.log(20, f'Here are the {unique_count} unique label values in your data: {list(unique_values)}')

    MULTICLASS_LIMIT = 1000  # if numeric and class count would be above this amount, assume it is regression
    if num_rows > 1000:
        REGRESS_THRESHOLD = 0.05  # if the unique-ratio is less than this, we assume multiclass classification, even when labels are integers
    else:
        REGRESS_THRESHOLD = 0.1

    if unique_count == 2:
        problem_type = BINARY
        reason = "only two unique label-values observed"
    elif unique_values.dtype == 'object':
        problem_type = MULTICLASS
        reason = "dtype of label-column == object"
    elif np.issubdtype(unique_values.dtype, np.floating):
        unique_ratio = unique_count / float(num_rows)
        if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
            try:
                can_convert_to_int = np.array_equal(y, y.astype(int))
                if can_convert_to_int:
                    problem_type = MULTICLASS
                    reason = "dtype of label-column == float, but few unique label-values observed and label-values can be converted to int"
                else:
                    problem_type = REGRESSION
                    reason = "dtype of label-column == float and label-values can't be converted to int"
            except:
                problem_type = REGRESSION
                reason = "dtype of label-column == float and label-values can't be converted to int"
        else:
            problem_type = REGRESSION
            reason = "dtype of label-column == float and many unique label-values observed"
    elif np.issubdtype(unique_values.dtype, np.integer):
        unique_ratio = unique_count / float(num_rows)
        if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
            problem_type = MULTICLASS  # TODO: Check if integers are from 0 to n-1 for n unique values, if they have a wide spread, it could still be regression
            reason = "dtype of label-column == int, but few unique label-values observed"
        else:
            problem_type = REGRESSION
            reason = "dtype of label-column == int and many unique label-values observed"
    else:
        raise NotImplementedError('label dtype', unique_values.dtype, 'not supported!')

    logger.log(25, f"AutoGluon infers your prediction problem is: {problem_type} (because {reason}).")
    logger.log(25, f"If this is wrong, please specify `problem_type` argument in fit() instead "
                   f"(You may specify problem_type as one of: {[BINARY, MULTICLASS, REGRESSION]})\n")
    return problem_type
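# A toy illustration (not AutoGluon code) of the unique-ratio heuristic used
# above: with more than 1000 rows, numeric labels are treated as multiclass
# only when unique_count / num_rows <= 0.05 and unique_count <= 1000.
import numpy as np
from pandas import Series

few_labels = Series([0, 1, 2] * 500)   # 3 uniques / 1500 rows -> ratio 0.002
many_labels = Series(np.arange(1500))  # 1500 uniques / 1500 rows -> ratio 1.0
for y in (few_labels, many_labels):
    ratio = y.nunique() / float(len(y))
    looks_multiclass = ratio <= 0.05 and y.nunique() <= 1000
    print(looks_multiclass)  # True for few_labels, False for many_labels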
def test_value_counts_bins(self): klasses = [Index, Series] for klass in klasses: s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] s = klass(s_values) # bins self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1) s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) exp1 = Series({0.998: 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) exp1n = Series({0.998: 1.0}) tm.assert_series_equal(res1n, exp1n) if isinstance(s1, Index): tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) else: exp = np.array([1, 2, 3], dtype=np.int64) tm.assert_numpy_array_equal(s1.unique(), exp) self.assertEqual(s1.nunique(), 3) res4 = s1.value_counts(bins=4) exp4 = Series({ 0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1 }, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) exp4n = Series({ 0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25 }, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = [ 'a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b' ] s = klass(s_values) expected = Series([4, 3, 2], index=['b', 'a', 'd']) tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): exp = Index(['a', 'b', np.nan, 'd']) tm.assert_index_equal(s.unique(), exp) else: exp = np.array(['a', 'b', np.nan, 'd'], dtype=object) tm.assert_numpy_array_equal(s.unique(), exp) self.assertEqual(s.nunique(), 3) s = klass({}) expected = Series([], dtype=np.int64) tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) # returned dtype differs depending on original if isinstance(s, Index): self.assert_index_equal(s.unique(), Index([]), exact=False) else: self.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False) self.assertEqual(s.nunique(), 0)
# By default NaN values are excluded from the calculation unless an entire
# row or column is NaN. The skipna option lets you keep NaN values from
# being excluded. skipna defaults to True.
print(df.sum(axis=1, skipna=False))  # no result for a row if any value is NaN

# Methods like idxmin and idxmax return indirect statistics, such as the
# index of the minimum or maximum value.
print(df.idxmax())
print(df.idxmin())

# Accumulation method: cumsum()
print(df.cumsum())

# Unique values and frequency methods
s1 = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
unique = s1.unique()
print(unique)  # unique() removes duplicate values

print(unique.sort())  # prints None: ndarray.sort() sorts in place and returns nothing

cnt = s1.value_counts()  # counts occurrences of each value (frequency); returns a Series
print(cnt)  # sorted in descending order; ties keep first-seen order

# isin tells whether each value is contained in the given set.
# It returns boolean values (True, False).
# Useful for picking out the values you want from a DataFrame or Series.
mask = s1.isin(['b', 'c'])
print(mask)  # True where the value is 'b' or 'c'
print(s1[mask])  # apply the mask to show only the values you want

data = DataFrame({
    'Q1': [1, 3, 4, 3, 4],
# Descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum()
df.sum(axis=1)
df.mean(axis=1, skipna=False)
df.idxmax()
df.describe()
# corr cov corrwith

# Unique values, value counts, membership
obj = Series(list('cadaabbcc'))
uniques = obj.unique()
uniques
uniques.sort()  # sorts in place
obj.value_counts()
pd.value_counts(obj.values, sort=False)
mask = obj.isin(['b', 'c'])
mask
obj[mask]

# Handling missing data
dates = pd.date_range('20170101', periods=6)
df = DataFrame(np.arange(24).reshape((6, 4)),
               index=dates, columns=['a', 'b', 'c', 'd'])
df.iloc[0, 1] = np.nan
def excelFromPictures(path, picture):
    SecretId = ""
    SecretKey = ""
    with open(picture, "rb") as f:
        img_data = f.read()
        img_base64 = b64encode(img_data)

    cred = credential.Credential(SecretId, SecretKey)  # SecretId and SecretKey are issued by Tencent Cloud
    httpProfile = HttpProfile()
    httpProfile.endpoint = "ocr.tencentcloudapi.com"
    clientProfile = ClientProfile()
    clientProfile.httpProfile = httpProfile
    client = ocr_client.OcrClient(cred, "ap-shanghai", clientProfile)

    req = models.TableOCRRequest()
    params = '{"ImageBase64":"' + str(img_base64, 'utf-8') + '"}'
    req.from_json_string(params)

    try:
        resp = client.TableOCR(req)
        # print(resp.to_json_string())
    except TencentCloudSDKException as err:
        print("Error[", err, "]\nYou can retry")
        return  # resp is undefined after a failed request

    ## Extract the recognized data and build the JSON
    result1 = loads(resp.to_json_string())
    # RowTl is the row index of a cell, ColTl is its column index, Text is the cell content
    rowIndex = []
    colIndex = []
    content = []
    for item in result1['TextDetections']:
        rowIndex.append(item['RowTl'])
        colIndex.append(item['ColTl'])
        content.append(item['Text'])

    ## Export to Excel (ExcelWriter approach)
    rowIndex = Series(rowIndex)
    colIndex = Series(colIndex)
    index = rowIndex.unique()
    index.sort()
    columns = colIndex.unique()
    columns.sort()
    data = DataFrame(index=index, columns=columns)
    for i in range(len(rowIndex)):
        data.loc[rowIndex[i], colIndex[i]] = re.sub(" ", "", content[i])

    writer = ExcelWriter(path + "/tables/" + re.match(r".*\.", f.name).group() + "xlsx",
                         engine='xlsxwriter')
    data.to_excel(writer, sheet_name='Sheet1', index=False, header=False)
    writer.save()
    print("Finished extracting " + f.name)
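# A minimal, self-contained sketch (toy data, not Tencent OCR output) of the
# pivot step above: scatter (row, col, text) triples into a DataFrame the
# same way excelFromPictures does before writing the Excel file.
from pandas import DataFrame, Series

rowIndex = Series([0, 0, 1, 1])
colIndex = Series([0, 1, 0, 1])
content = ["name", "age", "alice", "30"]

grid = DataFrame(index=sorted(rowIndex.unique()),
                 columns=sorted(colIndex.unique()))
for i in range(len(rowIndex)):
    grid.loc[rowIndex[i], colIndex[i]] = content[i]
print(grid)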
# ### Extracting values

# #### Series

# #### unique

# In[79]:

f = Series(list("가나라다다나라다"))
f

# In[80]:

f.unique()

# In[81]:

tmp = f.unique()
print("before sort", tmp)
tmp.sort()
print("after sort", tmp)

# In[82]:

f.duplicated()
from pandas import DataFrame,Series import numpy as np import pandas as pd import matplotlib.pyplot as plt y_df2 = pd.read_csv('data/759funds.csv',index_col=0) fund_names2 = Series(y_df2.columns) fund_names = fund_names2.unique() ind = y_df2.index # funds = {} # for name in fund_names: # funds[name] = [] # f = open('all_output/fund_'+name+'_beta.txt') # cols=-1 # for line in f.readlines(): # if line[0]=='b': # funds[name].append([]) # cols=cols+1 # else: # funds[name][cols].append(float(line)) # f.close() alphas = {} for name in fund_names: alphas[name]=[] f = open('all_output/fund_'+name+'_alpha_median_ma6.txt') for line in f.readlines(): alphas[name].append(float(line)) f.close()
def test_value_counts_bins(self): klasses = [Index, Series] for klass in klasses: s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] s = klass(s_values) # bins pytest.raises(TypeError, lambda bins: s.value_counts(bins=bins), 1) s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) exp1 = Series({Interval(0.997, 3.0): 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) exp1n = Series({Interval(0.997, 3.0): 1.0}) tm.assert_series_equal(res1n, exp1n) if isinstance(s1, Index): tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) else: exp = np.array([1, 2, 3], dtype=np.int64) tm.assert_numpy_array_equal(s1.unique(), exp) assert s1.nunique() == 3 # these return the same res4 = s1.value_counts(bins=4, dropna=True) intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4, exp4) res4 = s1.value_counts(bins=4, dropna=False) intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b'] s = klass(s_values) expected = Series([4, 3, 2], index=['b', 'a', 'd']) tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): exp = Index(['a', 'b', np.nan, 'd']) tm.assert_index_equal(s.unique(), exp) else: exp = np.array(['a', 'b', np.nan, 'd'], dtype=object) tm.assert_numpy_array_equal(s.unique(), exp) assert s.nunique() == 3 s = klass({}) expected = Series([], dtype=np.int64) tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) # returned dtype differs depending on original if isinstance(s, Index): tm.assert_index_equal(s.unique(), Index([]), exact=False) else: tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False) assert s.nunique() == 0
["CVX", "XOM", "BP"], start=datetime.datetime(2010, 1, 1), end=datetime.datetime(2013, 1, 1) )["Adj Close"] prices.head() volume = pdweb.get_data_yahoo( ["CVX", "XOM", "BP"], start=datetime.datetime(2010, 1, 1), end=datetime.datetime(2013, 1, 1) )["Volume"] volume.head() rets = prices.pct_change() # Correction of the stocks rcorr = rets.corr prices.plot() volume.plot() import seaborn as sns import matplotlib.pyplot as plt # seaborn correlation plot between pct change in stock price sns.corrplot(rets, annot=False, diag_names=False) prices.cov # covariance method # unique values of a series ser1 = Series(["w", "w", "x", "y", "z", "w", "x", "y", "x", "a"]) ser1.unique() ser1.value_counts()
def fit(self, X: pd.DataFrame, y: pd.Series):
    """
    Learns the numbers that should be used to replace the categories in each
    variable. That is the WoE.

    Parameters
    ----------
    X : pandas dataframe of shape = [n_samples, n_features]
        The training input samples.
        Can be the entire dataframe, not just the categorical variables.
    y : pandas series.
        Target, must be binary [0,1].

    Attributes
    ----------
    encoder_dict_: dictionary
        The dictionary containing the {category: WoE} pairs per variable.
    """
    X = self._check_fit_input_and_variables(X)

    # check that y is binary
    if any(x not in [0, 1] for x in y.unique()):
        raise ValueError(
            "This encoder is only designed for binary classification, values of y "
            "can be only 0 or 1.")

    temp = pd.concat([X, y], axis=1)
    temp.columns = list(X.columns) + ["target"]

    self.encoder_dict_ = {}

    total_pos = temp["target"].sum()
    total_neg = len(temp) - total_pos
    temp["non_target"] = np.where(temp["target"] == 1, 0, 1)

    for var in self.variables:
        pos = temp.groupby([var])["target"].sum() / total_pos
        neg = temp.groupby([var])["non_target"].sum() / total_neg

        t = pd.concat([pos, neg], axis=1)
        t["woe"] = np.log(t["target"] / t["non_target"])

        if (not t.loc[t["target"] == 0, :].empty
                or not t.loc[t["non_target"] == 0, :].empty):
            raise ValueError(
                "The proportion of one of the classes for a category in "
                "variable {} is zero, and log of zero is not defined".format(var))

        self.encoder_dict_[var] = t["woe"].to_dict()

    self._check_encoding_dictionary()

    self.input_shape_ = X.shape

    return self
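# A hand-worked sketch (toy data, not part of the encoder) of the WoE formula
# fit() computes per category:
#     WoE = ln( P(category | y == 1) / P(category | y == 0) )
import numpy as np
import pandas as pd

X = pd.DataFrame({"colour": ["red", "red", "blue", "blue", "blue", "red"]})
y = pd.Series([1, 0, 1, 1, 0, 0], name="target")

temp = pd.concat([X, y], axis=1)
temp["non_target"] = np.where(temp["target"] == 1, 0, 1)
pos = temp.groupby("colour")["target"].sum() / temp["target"].sum()
neg = temp.groupby("colour")["non_target"].sum() / temp["non_target"].sum()
print(np.log(pos / neg))
# blue: ln((2/3) / (1/3)) ~  0.693
# red:  ln((1/3) / (2/3)) ~ -0.693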
def test_value_counts_inferred(self): klasses = [Index, Series] for klass in klasses: s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"] s = klass(s_values) expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"]) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.unique(s_values)) self.assertEqual(s.nunique(), 4) # don't sort, have to sort after the fact as not sorting is # platform-dep hist = s.value_counts(sort=False).sort_values() expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values() tm.assert_series_equal(hist, expected) # sort ascending hist = s.value_counts(ascending=True) expected = Series([1, 2, 3, 4], index=list("cdab")) tm.assert_series_equal(hist, expected) # relative histogram. hist = s.value_counts(normalize=True) expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"]) tm.assert_series_equal(hist, expected) # bins self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1) s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) exp1 = Series({0.998: 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) exp1n = Series({0.998: 1.0}) tm.assert_series_equal(res1n, exp1n) self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3])) self.assertEqual(s1.nunique(), 3) res4 = s1.value_counts(bins=4) exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"] s = klass(s_values) expected = Series([4, 3, 2], index=["b", "a", "d"]) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.array(["a", "b", np.nan, "d"], dtype="O")) self.assertEqual(s.nunique(), 3) s = klass({}) expected = Series([], dtype=np.int64) tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) self.assert_numpy_array_equal(s.unique(), np.array([])) self.assertEqual(s.nunique(), 0) # GH 3002, datetime64[ns] # don't test names though txt = "\n".join( [ "xxyyzz20100101PIE", "xxyyzz20100101GUM", "xxyyzz20100101EGG", "xxyyww20090101EGG", "foofoo20080909PIE", "foofoo20080909GUM", ] ) f = StringIO(txt) df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"]) s = klass(df["dt"].copy()) s.name = None idx = pd.to_datetime(["2010-01-01 00:00:00Z", "2008-09-09 00:00:00Z", "2009-01-01 00:00:00X"]) expected_s = Series([3, 2, 1], index=idx) tm.assert_series_equal(s.value_counts(), expected_s) expected = np_array_datetime64_compat( ["2010-01-01 00:00:00Z", "2009-01-01 00:00:00Z", "2008-09-09 00:00:00Z"], dtype="datetime64[ns]" ) if isinstance(s, DatetimeIndex): expected = DatetimeIndex(expected) self.assertTrue(s.unique().equals(expected)) else: self.assert_numpy_array_equal(s.unique(), expected) self.assertEqual(s.nunique(), 3) # with NaT s = df["dt"].copy() s = klass([v for v in s.values] + [pd.NaT]) result = s.value_counts() self.assertEqual(result.index.dtype, "datetime64[ns]") tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) expected_s[pd.NaT] = 1 tm.assert_series_equal(result, expected_s) unique = s.unique() self.assertEqual(unique.dtype, "datetime64[ns]") # numpy_array_equal cannot compare pd.NaT self.assert_numpy_array_equal(unique[:3], expected) 
self.assertTrue(unique[3] is pd.NaT or unique[3].astype("int64") == pd.tslib.iNaT) self.assertEqual(s.nunique(), 3) self.assertEqual(s.nunique(dropna=False), 4) # timedelta64[ns] td = df.dt - df.dt + timedelta(1) td = klass(td, name="dt") result = td.value_counts() expected_s = Series([6], index=[Timedelta("1day")], name="dt") tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(["1 days"]) if isinstance(td, TimedeltaIndex): self.assertTrue(td.unique().equals(expected)) else: self.assert_numpy_array_equal(td.unique(), expected.values) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2, name="dt") result2 = td2.value_counts() tm.assert_series_equal(result2, expected_s)
l = l.value_counts()
print "Question:3."
print "-----------"
print "During what hour was the server the busiest in terms of requests?"
print "Answer:"
print "-------"
print "The MAXIMUM number of requests were made in the hour '%s'.\nIn this hour, a total of %d requests were made" % (l.idxmax(), l.max())

l = HTTP_DF[HTTP_DF['url'].str.contains('.gif', case=False)]['url']
l = l.value_counts()
print "\n"
print "Question:4."
print "-----------"
print "Which .gif image was downloaded the most during the day?"
print "Answer:"
print "-------"
print "The MAXIMUM number of downloads were made for the image '%s'.\nThis image was downloaded %d times" % (l.idxmax(), l.max())

l = HTTP_DF[HTTP_DF['retcode'] != 200]['retcode']
print "\n"
print "Question:5."
print "-----------"
print "What HTTP reply codes were sent other than 200?"
print "Answer:"
print "-------"
print "The following return codes (other than 200) were sent:"
print l.unique()
5 Mexico 6 Canada 7 Canada 8 Canada 9 Canada 10 Canada 11 NaN 12 NaN 13 NaN 14 NaN dtype: object ''' ser1.drop('b') # drops index 'b' and its associated value ser1.unique() # returns unique values within a series ser1.value_counts() # returns counts of values in a Series ''' w 3 y 2 a 1 z 1 x 1 dtype: int64 ''' # hierarchical indexes are illustrated by these examples ser = Series(randn(6), index = [[1,1,1,2,2,2],['a','b','c','a','b','c']]) ''' 1 a 0.187640
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

print 'deduplication'
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print obj.unique()
print obj.value_counts()
print

print 'membership test'
mask = obj.isin(['b', 'c'])
print mask
print obj[mask]  # only print the elements b and c

data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
print data
print data.apply(pd.value_counts).fillna(0)
print data.apply(pd.value_counts, axis=1).fillna(0)
a = s.values
# get index
i = s.index
# assign name
s.name = 'name'
# length
assert len(s) == s.size == s.shape[0]
# number of elements that are not NaN
s.count()
# get an array of unique values
s.unique()
# like a count(*) ... GROUP BY over the non-NaN values; returns a Series
s.value_counts()
# aggregation and statistics
s.max()
s.mean()
s.var()
# location of the max element
s.idxmax()
# rank
s = Series([4, 1, 2, 5])
s.rank()  # returns [3, 1, 2, 4]
def count_result(ser: pd.Series): return ser.unique(), ser.value_counts()
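# Usage sketch for count_result (illustrative data):
import pandas as pd

uniques, counts = count_result(pd.Series(["a", "b", "a", "c", "a"]))
print(uniques)  # ['a' 'b' 'c']
print(counts)   # a: 3, b: 1, c: 1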
def value_encoder(labels: pd.Series) -> Dict[Any, int]: ret = {} uniques = labels.unique() for index, label in enumerate(uniques): ret[label] = index return ret
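# Usage sketch for value_encoder: indices follow first-seen order, because
# Series.unique() preserves order of appearance.
import pandas as pd

print(value_encoder(pd.Series(["cat", "dog", "cat", "bird"])))
# {'cat': 0, 'dog': 1, 'bird': 2}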
price
volume = DataFrame({tic: data['Volume']
                    for tic, data in all_data.iteritems()})

# percent changes of the prices:
returns = price.pct_change()
returns.tail()

returns.MSFT.corr(returns.IBM)  # correlation of the overlapping non-NA
returns.MSFT.cov(returns.IBM)   # covariance of the overlapping non-NA
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)
returns.corrwith(volume)

## Unique values, Value counts, and membership
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
uniques
obj.value_counts()  # value frequencies

from pandas import value_counts
value_counts(obj.values, sort=False)

obj
mask = obj.isin(['b', 'c'])
obj[mask]
mask

data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
data
data.Qu1
data.Qu1.value_counts()
def generate_arb_id_dictionary(self,
                               a_timer: PipelineTimer,
                               normalize_strategy: Callable,
                               time_conversion: int = 1000,
                               freq_analysis_accuracy: float = 0.0,
                               freq_synchronous_threshold: float = 0.0,
                               force: bool = False) -> (dict, dict):
    id_dictionary = {}
    j1979_dictionary = {}
    if force:
        # Remove any existing pickled Arb ID and J1979 dictionaries and create
        # new ones based on data_filename.
        if path.isfile(self.id_output_filename):
            remove(self.id_output_filename)
        if path.isfile(self.j1979_output_filename):
            remove(self.j1979_output_filename)
        self.import_csv(a_timer, self.data_filename)
    elif path.isfile(self.id_output_filename):
        # This logic assumes that there will be a J1979 dict if and only if
        # there is an Arb ID dict.
        print("\tLoading Arb ID dictionary from pickled data: " + getcwd() +
              "\\" + self.id_output_filename)
        id_dictionary = load(open(self.id_output_filename, "rb"))
        if path.isfile(self.j1979_output_filename):
            print("\tLoading J1979 dictionary from pickled data: " + getcwd() +
                  "\\" + self.j1979_output_filename)
            j1979_dictionary = load(open(self.j1979_output_filename, "rb"))
        print("\tSet 'force_pre_processing' in Sample.py to True to re-compute instead...")
        return id_dictionary, j1979_dictionary
    else:
        self.import_csv(a_timer, self.data_filename)

    a_timer.start_function_time()

    for arb_id in Series.unique(self.data['id']):
        if isinstance(arb_id, int64):
            if arb_id == 2015:
                # This is the J1979 request ID (0x7DF = 2015), if any. Just ignore it.
                continue
            elif arb_id == 2024 and self.use_j1979:
                # This is the J1979 response ID (0x7E8 = 2024).
                j1979_data = self.data.loc[self.data['id'] == arb_id].copy()
                j1979_data.drop('dlc', axis=1, inplace=True)
                j1979_data.drop('id', axis=1, inplace=True)
                a_timer.start_nested_function_time()
                j1979_dictionary = self.generate_j1979_dictionary(j1979_data)
                a_timer.set_j1979_creation()
            elif arb_id > 0:
                a_timer.start_iteration_time()
                this_id = ArbID(arb_id)
                this_id.original_data = self.data.loc[self.data['id'] == arb_id]
                this_id.original_data = this_id.original_data.copy()  # type: DataFrame
                # Check if the Arbitration ID always used the same DLC. If not, ignore it.
                # We can effectively ignore this Arb ID by not adding it to the Arb ID dictionary.
                if this_id.original_data['dlc'].nunique() != 1:
                    continue
                this_id.dlc = this_id.original_data['dlc'].iloc[0]
                this_id.original_data.drop('dlc', axis=1, inplace=True)
                this_id.original_data.drop('id', axis=1, inplace=True)
                # If DLC < 8, we can automatically drop data column vectors > DLC.
                # E.g. drop bytes "b7" and "b6" if DLC is 6; those are padding data
                # injected by candump and were not actually on the bus.
                if this_id.dlc < 8:
                    for i in range(this_id.dlc, 8):
                        this_id.original_data.drop('b' + str(i), axis=1, inplace=True)
                # Check if there are duplicate index values and correct them.
                if not this_id.original_data.index.is_unique:
                    correction_mask = this_id.original_data.index.duplicated()
                    this_id.original_data = this_id.original_data[~correction_mask]
                this_id.generate_binary_matrix_and_tang(a_timer, normalize_strategy)
                this_id.analyze_transmission_frequency(
                    time_convert=time_conversion,
                    ci_accuracy=freq_analysis_accuracy,
                    synchronous_threshold=freq_synchronous_threshold)
                id_dictionary[arb_id] = this_id
                a_timer.set_arb_id_creation()

    a_timer.set_raw_df_to_arb_id_dict()
    return id_dictionary, j1979_dictionary
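# A toy sketch (synthetic frame, not real CAN data) of the DLC-padding drop
# performed above: with dlc == 6, byte columns b6 and b7 are padding and get
# removed.
from pandas import DataFrame

frame = DataFrame({'b' + str(i): [0] for i in range(8)})
dlc = 6
for i in range(dlc, 8):
    frame.drop('b' + str(i), axis=1, inplace=True)
print(list(frame.columns))  # ['b0', 'b1', 'b2', 'b3', 'b4', 'b5']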
'''
0     w
1     y
2     a
3     w
4     y
5     z
6     b
7     q
8     w
9     g
10    h
dtype: object
'''
print(ser1.unique())
'''
['w' 'y' 'a' 'z' 'b' 'q' 'g' 'h']
'''
# count values
print(ser1.value_counts())
'''
w    3
y    2
q    1
g    1
z    1
h    1
b    1
# ## 12. Unique values, value counts, and membership

# In[212]:

obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'd', 'd', 'c'])

# In[213]:

obj

# In[215]:

obj.unique()

# **value_counts returns the frequency of each value, in descending order by
# default. It is also a top-level pandas method that can be used with any
# array or sequence**

# In[216]:

obj.value_counts()

# In[217]:

pd.value_counts(obj.values, sort=False)

# **isin: vectorized set-membership test**
# Basic statistics
df7 = DataFrame([[1.4, np.nan], [7.1, -4.5],
                 [np.nan, np.nan], [0.75, -1.3]],
                index=['a', 'b', 'c', 'd'],
                columns=['one', 'two'])
df7.sum()
df7.sum(axis=1)
# with skipna=False, a row containing NaN yields NaN instead of a mean
df7.mean(axis=1, skipna=False)
# index of the maximum value
df7.idxmax()
# cumulative sum
df7.cumsum()
# returns a collection of summary statistics
df7.describe()

# unique values and value counts
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
unique = obj.unique()
obj.value_counts()
pd.value_counts(obj.values, sort=True)
mask = obj.isin(['b', 'c'])
obj[mask]

# detecting missing data
data8 = Series(['a', 'b', np.nan, 'd'])
data8.isnull()
data8[2] = None
data8.isnull()
data9 = Series([1, np.nan, 2, np.nan])
# dropna has extended functionality for DataFrames
data9.dropna()
count    4.000000
mean     4.500000
std      2.516611
min      2.000000
25%      3.500000
50%      4.000000
75%      5.000000
max      8.000000
dtype: float64
'''
print 'deduplication'
obj = Series(['c', 'a', 'd', 'b', 'b', 'c'])
print obj.unique()
print obj.value_counts()
print

print 'membership test'
mask = obj.isin(['b', 'c'])
print mask
print obj[mask]  # only print the elements b and c

data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
print
import matplotlib.pyplot as plt array1 = np.array([[10, np.nan, 20], [30, 40, np.nan]]) print array1 df1 = DataFrame(array1, index=[1, 2], columns=list('ABC')) print df1 #sum() print df1.sum() #sums along each column print df1.sum(axis=1) #sum along indexes print df1.min() print df1.max() print df1.idxmax() print df1.cumsum() print df1.describe() df2 = DataFrame(randn(9).reshape(3, 3), index=[1, 2, 3], columns=list('ABC')) print df2 plt.plot(df2) plt.legend(df2.columns, loc="lower right") plt.savefig('samplepic.png') plt.show() ser1 = Series(list('abcccaabd')) print ser1.unique() print ser1.value_counts()
print obj.describe()  # compute summary statistics for a Series
'''
count    4.000000
mean     4.500000
std      2.516611
min      2.000000
25%      3.500000
50%      4.000000
75%      5.000000
max      8.000000
dtype: float64
'''
print 'deduplication'
obj = Series(['c', 'a', 'd', 'b', 'b', 'c'])
print obj.unique()
print obj.value_counts()
print

print 'membership test'
mask = obj.isin(['b', 'c'])
print mask
print obj[mask]  # only print the elements b and c

data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
print data
print data.apply(pd.value_counts).fillna(0)  # count occurrences of each number in every column; fill missing with 0
print data.apply(pd.value_counts, axis=1).fillna(0)  # count occurrences of each number in every row; fill missing with 0
# -*- coding: utf-8 -*-
import numpy as np
import pandas as pd
from pandas import Series, DataFrame

print('deduplication')
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(obj.unique())
print(obj.value_counts())
print()

print('membership test')
mask = obj.isin(['b', 'c'])
print(mask)
print(obj[mask])  # only print the elements b and c

data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
print(data)
print(data.apply(pd.value_counts).fillna(0))
print(data.apply(pd.value_counts, axis=1).fillna(0))
def main(): set_logger() logger = logging.getLogger('clusters.main') logger.info("Parsing arguments") file, clusters = get_args() logger.info("Load data and check for data consistency") df = pd.read_json(file) df = df.dropna(axis = 0, how='any') X = df.loc[:, ['lat', 'lng']].values logger.info("Performing KMeans clustering") kmeans = KMeans(n_clusters=clusters, max_iter=1000).fit(X) #Cluster metadata logger.info("Calculating cluster metadata") centers = {k: v for k, v in enumerate(kmeans.cluster_centers_)} logger.info("Counting number of crimes of each cluster") labels = Series(kmeans.labels_) num_labels = {} for l in labels.unique(): num = labels[labels == l].count() num_labels[l] = num logger.debug("Number of occurrences of each label: {}".format(num_labels)) logger.info("Transforming counting into percentage") total = labels.count() percentage = {k: v/total for k, v in num_labels.items()} logger.debug("Percentage of each label: {}".format(percentage)) logger.info("Removing clusters with few points") labels_remove, lost_points = filter_clusters(percentage, X, labels) for l in labels_remove: percentage.pop(l, 'None') num_labels.pop(l, 'None') centers.pop(l, 'None') logger.debug("Number of occurrences of each label after filtering: {}".format(num_labels)) logger.debug("Percentage of each label after filtering: {}".format(percentage)) logger.debug("Number of filtered points: {}".format(lost_points.shape[0])) logger.info("Assign lost points to new clusters") new_labels = new_assignement(lost_points, centers) for l in new_labels: num_labels[l] += 1 percentage = {k: v/total for k, v in num_labels.items()} logger.info("Save results in JSON") #Separate center into lat and lng centers_lat, centers_lng= {}, {} for k, v in centers.items(): centers_lat[k], centers_lng[k] = v[0], v[1] columns = [ 'Number of crimes', 'Percentage of total crimes', 'lat', 'lng' ] df_meta = DataFrame({ columns[0]: num_labels, columns[1]: percentage, columns[2]: centers_lat, columns[3]: centers_lng }) _, basename = os.path.split(file) basename, _ = os.path.splitext(basename) df_meta.to_json( path_or_buf=basename+'Cluster.json', orient='records' )
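# Side note (a sketch, not part of the script above): the per-label counting
# loop can be expressed directly with value_counts().
from pandas import Series

labels = Series([0, 1, 0, 2, 0, 1])
num_labels = labels.value_counts().to_dict()                # {0: 3, 1: 2, 2: 1}
percentage = labels.value_counts(normalize=True).to_dict()  # {0: 0.5, 1: ~0.33, 2: ~0.17}
print(num_labels, percentage)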
# By default NA values are excluded from the calculation unless an entire
# row or column is NA. The skipna option lets you keep NA values from being
# excluded. skipna defaults to True.
print(df.sum(axis=1, skipna=False))

# Methods like idxmin and idxmax return indirect statistics, such as the
# index of the minimum or maximum value.
print(df.idxmax())
print(df.idxmin())

# Accumulation method: cumsum()
print(df.cumsum())

# unique(): collapses duplicate values into one
s1 = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(s1.unique())

# value_counts(): counts occurrences of each value (frequency); returns a Series
print(s1.value_counts())  # results come out in descending order

# isin(): tells whether each value is contained in the Series
## returns boolean values (True, False)
mask = s1.isin(['b', 'c'])
print(mask)
print(s1[mask])  # this is how you can extract only the values you want

data = DataFrame({
    'Q1': [1, 3, 4, 3, 4],
    'Q2': [2, 3, 1, 2, 3],
    'Q3': [1, 5, 2, 4, 4]
})
# In[147]: df.mean() # In[148]: df.idxmax() # In[149]: df.describe() # In[151]: obj8 = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c']) u = obj8.unique() u # In[155]: obj8.value_counts() # In[157]: pd.value_counts(obj8.values, sort=False) # In[158]: mask = obj8.isin(['b', 'c']) mask
def test_value_counts_inferred(self): klasses = [Index, Series] for klass in klasses: s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] s = klass(s_values) expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.unique(s_values)) self.assertEqual(s.nunique(), 4) # don't sort, have to sort after the fact as not sorting is platform-dep hist = s.value_counts(sort=False) hist.sort() expected = Series([3, 1, 4, 2], index=list('acbd')) expected.sort() tm.assert_series_equal(hist, expected) # sort ascending hist = s.value_counts(ascending=True) expected = Series([1, 2, 3, 4], index=list('cdab')) tm.assert_series_equal(hist, expected) # relative histogram. hist = s.value_counts(normalize=True) expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c']) tm.assert_series_equal(hist, expected) # bins self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1) s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) exp1 = Series({0.998: 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) exp1n = Series({0.998: 1.0}) tm.assert_series_equal(res1n, exp1n) self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3])) self.assertEqual(s1.nunique(), 3) res4 = s1.value_counts(bins=4) exp4 = Series({ 0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1 }, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) exp4n = Series({ 0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25 }, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = [ 'a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b' ] s = klass(s_values) expected = Series([4, 3, 2], index=['b', 'a', 'd']) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal( s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O')) self.assertEqual(s.nunique(), 3) s = klass({}) expected = Series([], dtype=np.int64) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.array([])) self.assertEqual(s.nunique(), 0) # GH 3002, datetime64[ns] txt = "\n".join([ 'xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG', 'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM' ]) f = StringIO(txt) df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"]) s = klass(df['dt'].copy()) idx = pd.to_datetime([ '2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X' ]) expected_s = Series([3, 2, 1], index=idx) tm.assert_series_equal(s.value_counts(), expected_s) expected = np.array([ '2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z' ], dtype='datetime64[ns]') if isinstance(s, DatetimeIndex): expected = DatetimeIndex(expected) self.assertTrue(s.unique().equals(expected)) else: self.assert_numpy_array_equal(s.unique(), expected) self.assertEqual(s.nunique(), 3) # with NaT s = df['dt'].copy() s = klass([v for v in s.values] + [pd.NaT]) result = s.value_counts() self.assertEqual(result.index.dtype, 'datetime64[ns]') tm.assert_series_equal(result, expected_s) result = s.value_counts(dropna=False) expected_s[pd.NaT] = 1 tm.assert_series_equal(result, expected_s) unique = s.unique() self.assertEqual(unique.dtype, 'datetime64[ns]') # numpy_array_equal cannot compare pd.NaT self.assert_numpy_array_equal(unique[:3], expected) self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT) 
self.assertEqual(s.nunique(), 3) self.assertEqual(s.nunique(dropna=False), 4) # timedelta64[ns] td = df.dt - df.dt + timedelta(1) td = klass(td) result = td.value_counts() expected_s = Series([6], index=[Timedelta('1day')]) tm.assert_series_equal(result, expected_s) expected = TimedeltaIndex(['1 days']) if isinstance(td, TimedeltaIndex): self.assertTrue(td.unique().equals(expected)) else: self.assert_numpy_array_equal(td.unique(), expected.values) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2) result2 = td2.value_counts() tm.assert_series_equal(result2, expected_s)
def woe_analysis(df1, target, max_bin, force_bin):
    """
    Wrapper function for the mono_bin, char_bin, and woe_graph functions.
    This will automatically construct bins for each variable. For numerical
    variables, it will create bins such that the WOE relationship between
    bins is monotonic.

    Parameters
    ----------
    df1 : pandas dataframe
        training dataset
    target : pandas series
        target vector
    max_bin : int
        the maximum number of bins (categories) for numeric variable binning.
    force_bin : int
        For some numeric variables, the mono_bin function may produce only
        one category while binning. force_bin ensures that at least two
        categories are produced.

    Returns
    -------
    iv_df : pandas dataframe
        Weight of evidence / information value table and other data used to
        calculate WOE and IV for variables in the dataset
    iv : pandas dataframe
        Information value table for variables in the dataset
    """
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]

    x = df1.dtypes.index
    count = -1
    for i in x:
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i], np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i], max_bin, force_bin)
                conv["VAR_NAME"] = i
                count = count + 1
                woe_graph(conv, True)
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
                conv = conv.sort_values(by='WOE', ascending=False)
                woe_graph(conv, False)
            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv, ignore_index=True)

    iv = pd.DataFrame({'IV': iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return iv_df, iv
print(df1.sum(axis=1)) print(df1.min()) print(df1.max()) print('----------') print(df1.idxmax()) print(df1.idxmin()) print('----------') print(df1.cumsum()) print('----------') print(df1.describe()) print('----------') df2 = DataFrame(randn(9).reshape(3, 3), index=[1, 2, 3], columns=list('ABC')) print(df2) print('----------') plt.plot(df2) plt.legend(df2.columns, loc="lower right") plt.savefig("first graph in python") plt.show() ser1 = Series(list('abcccaabd')) print(ser1.unique()) print(ser1.value_counts())
def woe_conversion(df, woe): """ Converts the values of each variable for each borrower from its original value into the weight of evidence (WOE) values of the variable bin that the input value is in. Parameters ---------- df : pandas dataframe Cleaned explanatory variable training / testing / validation data frame that will be used to fit the model. woe : pandas dataframe WOE / IV table that is output from woe_analysis() function Return ------ df_copy : pandas dataframe Converted dataframe, from original input values to corresponding WOE values """ df_copy = df.copy() woe_df = woe.copy() var_list = list(df_copy) for i in range(0, len(var_list)): var_str = "'%s'" % var_list[i] var_woe = woe_df.loc[woe_df['VAR_NAME'] == var_list[i]].copy() var_woe['max_range'] = var_woe['MAX_VALUE'] if np.issubdtype(df_copy[var_list[i]], np.number) and\ (len(Series.unique(df_copy[var_list[i]])) > 2): var_woe['min_range'] = var_woe.groupby( 'VAR_NAME')['MAX_VALUE'].shift(1) var_woe.loc[var_woe['MIN_VALUE'].isnull(), 'min_range'].isnull() var_woe.loc[var_woe['min_range'].isnull(), 'min_range'] = var_woe['MIN_VALUE'] else: var_woe['min_range'] = var_woe['MIN_VALUE'] var_woe_clean = var_woe[var_woe['MIN_VALUE'].notnull()] var_woe_null = var_woe[var_woe['MIN_VALUE'].isnull()] if not var_woe_null.empty: woe_null = var_woe_null.iloc[0]['WOE'] else: woe_null = np.nan min_value_list = var_woe_clean['min_range'].tolist() max_value_list = var_woe_clean['max_range'].tolist() choices = var_woe_clean['WOE'].tolist() cond_str_list = [] N = len(min_value_list) for j in range(0, len(min_value_list)): #condition for binary indicator variables if np.issubdtype(df_copy[var_list[i]], np.number) and \ (len(Series.unique(df_copy[var_list[i]])) == 2) and \ min_value_list[j] == max_value_list[j]: com_str = "(df_copy[" + var_str + "] ==" + str( min_value_list[j]) + ")" elif np.issubdtype(df_copy[var_list[i]], np.number): if j == 0: com_str = "(df_copy[" + var_str + "] <=" + str( max_value_list[j]) + ")" elif j == (N - 1): com_str = "(df_copy[" + var_str + "] >" + str( min_value_list[j]) + ")" else: com_str = "(df_copy[" + var_str + "] >" + str( min_value_list[j] ) + ") & (df_copy[" + var_str + "] <=" + str( max_value_list[j]) + ")" else: char_str = "'%s'" % min_value_list[j] com_str = "(df_copy[" + var_str + "] ==" + char_str + ")" cond_str_list.append(com_str) full_conds = ','.join(cond_str_list) conditions = eval(full_conds) var_woe_label = var_list[i] + '_woe' df_copy[var_woe_label] = np.select(conditions, choices, default=woe_null) df_copy = df_copy.drop([var_list[i]], axis=1) return df_copy
def infer_problem_type(y: Series, silent=False) -> str:
    """ Identifies which type of prediction problem we are interested in (if
        user has not specified), i.e. binary classification, multi-class
        classification, or regression.
    """
    if len(y) == 0:
        raise ValueError("provided labels cannot have length = 0")
    y = y.dropna()  # Remove missing values from y (there should not be any though as they were removed in Learner.general_data_processing())
    num_rows = len(y)

    unique_values = y.unique()

    MULTICLASS_LIMIT = 1000  # if numeric and class count would be above this amount, assume it is regression
    if num_rows > 1000:
        REGRESS_THRESHOLD = 0.05  # if the unique-ratio is less than this, we assume multiclass classification, even when labels are integers
    else:
        REGRESS_THRESHOLD = 0.1

    unique_count = len(unique_values)
    if unique_count == 2:
        problem_type = BINARY
        reason = "only two unique label-values observed"
    elif y.dtype.name in ['object', 'category', 'string']:
        problem_type = MULTICLASS
        reason = f"dtype of label-column == {y.dtype.name}"
    elif np.issubdtype(y.dtype, np.floating):
        unique_ratio = unique_count / float(num_rows)
        if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
            try:
                can_convert_to_int = np.array_equal(y, y.astype(int))
                if can_convert_to_int:
                    problem_type = MULTICLASS
                    reason = "dtype of label-column == float, but few unique label-values observed and label-values can be converted to int"
                else:
                    problem_type = REGRESSION
                    reason = "dtype of label-column == float and label-values can't be converted to int"
            except:
                problem_type = REGRESSION
                reason = "dtype of label-column == float and label-values can't be converted to int"
        else:
            problem_type = REGRESSION
            reason = "dtype of label-column == float and many unique label-values observed"
    elif np.issubdtype(y.dtype, np.integer):
        unique_ratio = unique_count / float(num_rows)
        if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <= MULTICLASS_LIMIT):
            problem_type = MULTICLASS  # TODO: Check if integers are from 0 to n-1 for n unique values, if they have a wide spread, it could still be regression
            reason = "dtype of label-column == int, but few unique label-values observed"
        else:
            problem_type = REGRESSION
            reason = "dtype of label-column == int and many unique label-values observed"
    else:
        raise NotImplementedError(f'label dtype {y.dtype} not supported!')

    if not silent:
        logger.log(25, f"AutoGluon infers your prediction problem is: '{problem_type}' (because {reason}).")

        # TODO: Move this outside of this function so it is visible even if problem type was not inferred.
        if problem_type in [BINARY, MULTICLASS]:
            if unique_count > 10:
                logger.log(20, f'\tFirst 10 (of {unique_count}) unique label values: {list(unique_values[:10])}')
            else:
                logger.log(20, f'\t{unique_count} unique label values: {list(unique_values)}')
        elif problem_type == REGRESSION:
            y_max = y.max()
            y_min = y.min()
            y_mean = y.mean()
            y_stddev = y.std()
            logger.log(20, f'\tLabel info (max, min, mean, stddev): ({y_max}, {y_min}, {round(y_mean, 5)}, {round(y_stddev, 5)})')

        logger.log(25, f"\tIf '{problem_type}' is not the correct problem_type, please manually specify the problem_type parameter during predictor init "
                       f"(You may specify problem_type as one of: {[BINARY, MULTICLASS, REGRESSION]})")

    return problem_type
def create_mappings(x: pd.Series): labels = x.unique() lbl2idx = {label: idx for idx, label in enumerate(labels)} return lbl2idx, labels
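# Usage sketch for create_mappings: the dict encodes label -> index and the
# returned `labels` array decodes index -> label.
import pandas as pd

lbl2idx, labels = create_mappings(pd.Series(["low", "high", "low", "mid"]))
print(lbl2idx)    # {'low': 0, 'high': 1, 'mid': 2}
print(labels[1])  # 'high'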
def test_value_counts_bins(self, klass): s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] s = klass(s_values) # bins with pytest.raises(TypeError): s.value_counts(bins=1) s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) exp1 = Series({Interval(0.997, 3.0): 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) exp1n = Series({Interval(0.997, 3.0): 1.0}) tm.assert_series_equal(res1n, exp1n) if isinstance(s1, Index): tm.assert_index_equal(s1.unique(), Index([1, 2, 3])) else: exp = np.array([1, 2, 3], dtype=np.int64) tm.assert_numpy_array_equal(s1.unique(), exp) assert s1.nunique() == 3 # these return the same res4 = s1.value_counts(bins=4, dropna=True) intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4, exp4) res4 = s1.value_counts(bins=4, dropna=False) intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0]) exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) exp4n = Series([0.5, 0.25, 0.25, 0], index=intervals.take([0, 3, 1, 2])) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = [ 'a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b' ] s = klass(s_values) expected = Series([4, 3, 2], index=['b', 'a', 'd']) tm.assert_series_equal(s.value_counts(), expected) if isinstance(s, Index): exp = Index(['a', 'b', np.nan, 'd']) tm.assert_index_equal(s.unique(), exp) else: exp = np.array(['a', 'b', np.nan, 'd'], dtype=object) tm.assert_numpy_array_equal(s.unique(), exp) assert s.nunique() == 3 s = klass({}) expected = Series([], dtype=np.int64) tm.assert_series_equal(s.value_counts(), expected, check_index_type=False) # returned dtype differs depending on original if isinstance(s, Index): tm.assert_index_equal(s.unique(), Index([]), exact=False) else: tm.assert_numpy_array_equal(s.unique(), np.array([]), check_dtype=False) assert s.nunique() == 0
def con_column_analysis(con_column: Series):
    print(con_column.describe())
    print("There are {} NaN values.".format(con_column.isnull().sum()))
    print("Null values account for {:.2f}.".format(con_column.isnull().sum() / len(con_column)))
    print("This column has {} unique items.".format(len(con_column.unique())))
}
f = DataFrame(dic, index=np.arange(100, 80, -1))
f2 = DataFrame({'line03': np.linspace(30, 35, 10),
                'line04': np.arange(10)},
               index=np.arange(100, 90, -1))
f3 = f.add(f2)

# Summation
# sum of each column
sum = f.sum()
# sum of the specified columns
sum1 = f[['line01', 'line02']].sum()
# sum of each row
sum2 = f.sum(axis=1)
# print sum
# print sum1
# print sum2
# If a row or column contains NaN the result is NaN; skipna defaults to
# True, which ignores NaN
sum4 = f3.sum(skipna=False)
# print sum4

# Get the unique values
S2 = Series(['c', 'd', 'a', 'c', 'c', 'c', 'r', 'a', 'd'])
uniques = S2.unique()
# print uniques
# Count the occurrences of each value
uniques_counts = S2.value_counts()
# print uniques_counts
# Get value-count information across multiple DataFrame columns
result = f.apply(pd.value_counts).fillna(0)
print result
def cat_column_analysis(cat_column: Series):
    print(cat_column.describe())
    print("There are {} NaN values.".format(cat_column.isnull().sum()))
    print("Null values account for {:.2f}.".format(cat_column.isnull().sum() / len(cat_column)))
    print("In this column, there are", cat_column.unique(), "items")
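# Quick usage sketch (toy data) for the column profilers defined above,
# con_column_analysis and cat_column_analysis:
import numpy as np
from pandas import Series

con_column_analysis(Series([1.0, 2.5, np.nan, 4.0]))
cat_column_analysis(Series(["a", "b", "a", None]))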
def main():
    """ Calculation and aggregation of summary statistics """
    # Summary of statistics
    # the return value is a Series, not an ndarray
    df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                    [np.nan, np.nan], [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print df
    print df.sum()
    print df.sum(axis=1)
    print df.mean(axis=1)  # exclude nan
    print df.mean(axis=1, skipna=False)
    print df.idxmin()
    print df.idxmax()
    print df.cumsum()
    print df.describe()
    # values are not numeric
    obj = Series(list('aabc') * 4)
    print obj.describe()

    methods = ['count', 'min', 'max',
               # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std', 'skew', 'kurt',
               'cummin', 'cummax', 'cumprod', 'diff', 'pct_change']
    for method in methods:
        print u'「{0}」'.format(method)
        print getattr(df, method)()
        print ''

    # Correlation and Covariance
    all_data = {}
    lst = []  # ['AAPL', 'IBM', 'MSFT']  # , 'GOOG']:
    for ticket in lst:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
    price = DataFrame({tic: data['Adj Close']
                       for tic, data in all_data.iteritems()})
    volume = DataFrame({tic: data['Volume']
                        for tic, data in all_data.iteritems()})
    if all_data:
        returns = price.pct_change()
        print returns.tail()
        print ''
        print returns.MSFT.corr(returns.IBM)
        print returns.MSFT.cov(returns.IBM)
        print ''
        print returns.corr()
        print returns.cov()
        print ''
        print returns.corrwith(returns.IBM)
        print returns.corrwith(volume)

    # unique values, frequencies, membership
    print '', ''
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print uniques
    print obj.value_counts()
    print pd.value_counts(obj.values, sort=False)
    mask = obj.isin(['b', 'c'])
    print mask
    print obj[mask]
    data = DataFrame({
        'Qu1': [1, 3, 4, 3, 4],
        'Qu2': [2, 3, 1, 2, 3],
        'Qu3': [1, 5, 2, 4, 4],
    })
    print data
    print data.apply(pd.value_counts).fillna(0)
def create_property_values(
    row: pd.Series, scope: str, domain: str, dtypes: pd.Series
) -> dict:
    """
    This function generates the property values for a row in a file

    Parameters
    ----------
    row : pd.Series
        The current row of the data frame to create property values for
    scope : str
        The scope to create the property values in
    domain : str
        The domain to create the property values in
    dtypes : pd.Series
        The data types of each column to create property values for

    Returns
    -------
    properties : dict {str, models.PerpetualProperty}
    """
    # Ensure that all data types in the file have been mapped
    if not (
        set([str(data_type) for data_type in dtypes.unique()])
        <= set(global_constants["data_type_mapping"])
    ):
        raise TypeError(
            """There are data types in the data_frame which have not been mapped to LUSID data types,
            please ensure that all data types have been mapped before retrying"""
        )

    # Initialise the empty properties dictionary
    properties = {}

    # Iterate over each column name and data type
    for column_name, data_type in dtypes.iteritems():
        # Set the data type to be a string so that it is easier to work with
        string_data_type = str(data_type)
        # Convert the numpy data type to a LUSID data type using the global mapping
        lusid_data_type = global_constants["data_type_mapping"][string_data_type]
        # Get the value of the column from the row
        row_value = row[column_name]

        # Use the correct LUSID property value based on the data type
        if lusid_data_type == "string":
            if pd.isna(row_value):
                continue
            property_value = lusid.models.PropertyValue(label_value=row_value)

        if lusid_data_type == "number":
            # Handle null values given the input null value override
            if pd.isnull(row_value):
                continue
            property_value = lusid.models.PropertyValue(
                metric_value=lusid.models.MetricValue(value=row_value)
            )

        # Set the property
        property_key = (
            f"{domain}/{scope}/{cocoon.utilities.make_code_lusid_friendly(column_name)}"
        )
        properties[property_key] = lusid.models.PerpetualProperty(
            key=property_key, value=property_value
        )

    if domain.lower() == "instrument":
        properties = list(properties.values())

    return properties
def test_value_counts_inferred(self): klasses = [Index, Series] for klass in klasses: s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a'] s = klass(s_values) expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c']) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.unique(s_values)) self.assertEquals(s.nunique(), 4) # don't sort, have to sort after the fact as not sorting is platform-dep hist = s.value_counts(sort=False) hist.sort() expected = Series([3, 1, 4, 2], index=list('acbd')) expected.sort() tm.assert_series_equal(hist, expected) # sort ascending hist = s.value_counts(ascending=True) expected = Series([1, 2, 3, 4], index=list('cdab')) tm.assert_series_equal(hist, expected) # relative histogram. hist = s.value_counts(normalize=True) expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c']) tm.assert_series_equal(hist, expected) # bins self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1) s1 = Series([1, 1, 2, 3]) res1 = s1.value_counts(bins=1) exp1 = Series({0.998: 4}) tm.assert_series_equal(res1, exp1) res1n = s1.value_counts(bins=1, normalize=True) exp1n = Series({0.998: 1.0}) tm.assert_series_equal(res1n, exp1n) self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3])) self.assertEquals(s1.nunique(), 3) res4 = s1.value_counts(bins=4) exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4, exp4) res4n = s1.value_counts(bins=4, normalize=True) exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0]) tm.assert_series_equal(res4n, exp4n) # handle NA's properly s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b'] s = klass(s_values) expected = Series([4, 3, 2], index=['b', 'a', 'd']) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O')) self.assertEquals(s.nunique(), 3) s = klass({}) expected = Series([], dtype=np.int64) tm.assert_series_equal(s.value_counts(), expected) self.assert_numpy_array_equal(s.unique(), np.array([])) self.assertEquals(s.nunique(), 0) # GH 3002, datetime64[ns] txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG', 'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM']) f = StringIO(txt) df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"]) s = klass(df['dt'].copy()) idx = pd.to_datetime(['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X']) expected_s = Series([3, 2, 1], index=idx) tm.assert_series_equal(s.value_counts(), expected_s) expected = np.array(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z'], dtype='datetime64[ns]') if isinstance(s, DatetimeIndex): expected = DatetimeIndex(expected) self.assert_(s.unique().equals(expected)) else: self.assert_numpy_array_equal(s.unique(), expected) self.assertEquals(s.nunique(), 3) # with NaT s = df['dt'].copy() s = klass([v for v in s.values] + [pd.NaT]) result = s.value_counts() self.assertEqual(result.index.dtype, 'datetime64[ns]') expected_s[pd.NaT] = 1 tm.assert_series_equal(result, expected_s) unique = s.unique() self.assertEqual(unique.dtype, 'datetime64[ns]') # numpy_array_equal cannot compare pd.NaT self.assert_numpy_array_equal(unique[:3], expected) self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT) self.assertEquals(s.nunique(), 4) # timedelta64[ns] td = df.dt - df.dt + timedelta(1) td = 
klass(td) result = td.value_counts() expected_s = Series([6], index=[86400000000000]) self.assertEqual(result.index.dtype, 'int64') tm.assert_series_equal(result, expected_s) # get nanoseconds to compare expected = np.array([86400000000000]) self.assert_numpy_array_equal(td.unique(), expected) self.assertEquals(td.nunique(), 1) td2 = timedelta(1) + (df.dt - df.dt) td2 = klass(td2) result2 = td2.value_counts() self.assertEqual(result2.index.dtype, 'int64') tm.assert_series_equal(result2, expected_s) self.assert_numpy_array_equal(td.unique(), expected) self.assertEquals(td.nunique(), 1)
start=datetime.datetime(2010, 1, 1),
                              end=datetime.datetime(2013, 1, 1))['Adj Close']
prices.head()

volume = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'],
                              start=datetime.datetime(2010, 1, 1),
                              end=datetime.datetime(2013, 1, 1))['Volume']
volume.head()

rets = prices.pct_change()

# Correlation of the stocks
rcorr = rets.corr()

prices.plot()
volume.plot()

import seaborn as sns
import matplotlib.pyplot as plt

# seaborn correlation plot between pct change in stock price
sns.corrplot(rets, annot=False, diag_names=False)

prices.cov()  # covariance method

# unique values of a series
ser1 = Series(['w', 'w', 'x', 'y', 'z', 'w', 'x', 'y', 'x', 'a'])
ser1.unique()
ser1.value_counts()
        return choice(range(len(weights)), p=weights)
    else:
        return None

# A lambda is used below to define a function that generates samples.
# Show that samples drawn directly from the normal distribution are unique.
################################
# Because I set α=10 (which is relatively small), the approximation
# is fairly coarse. In terms of memoization, a small α value means
# the stochastic memoizer will more frequently reuse values already
# seen instead of drawing new ones.
###############################
base_measure = lambda: norm().rvs()
ndraws = 10000
print("Number of unique samples after {} draws..".format(ndraws))
draws = Series([base_measure() for _ in range(ndraws)])
print(draws.unique().size)
################################
norm_dp = DirichletProcessSample(base_measure, alpha=100)
print("Number of unique samples after {} draws:".format(ndraws))
dp_draws = Series([norm_dp() for _ in range(ndraws)])
print(dp_draws.unique().size)
#################################
Series(norm_dp() for _ in range(10000)).hist()
_ = plt.title("Histogram of Samples from norm_dp")
plt.show()
################################
norm_hdp = DirichletProcessSample(norm_dp, alpha=10)
# samples..
Series(norm_hdp() for _ in range(10000)).hist()
_ = plt.title("Histogram of Samples from norm_hdp")
plt.show()
###############################
all_data = {} for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']: print("get data:" + ticker) all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2010', '1/30/2010') price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()}) volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()}) returns = price.pct_change() print(returns.tail()) print(returns.MSFT.corr(returns.IBM)) print(returns.MSFT.cov(returns.IBM)) print(returns.corr()) print(returns.cov()) print(returns.corrwith(returns.IBM)) print(returns.corrwith(volume)) obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c']) print(obj.unique()) print(obj.value_counts()) print(pd.value_counts(obj.values, sort=False)) mask = obj.isin(['b', 'c']) print(mask) print(obj[mask]) data = DataFrame({ 'QU1': [1, 3, 4, 3, 4], 'QU2': [2, 3, 1, 2, 3], 'QU3': [1, 5, 2, 4, 4] }) print(data.apply(pd.value_counts).fillna(0))