def wordify(abs_list, min_word_len=2):
    '''
    Convert the abstract field from PLoS API data to a filtered list of words.
    '''

    # The abstract field is a list. Make it a string.
    text = ' '.join(abs_list).strip(' \n\t')

    if text == '':
        return nan

    else:
        # Remove punctuation & replace with space,
        # because we want 'metal-contaminated' => 'metal contaminated'
        # ...not 'metalcontaminated', and so on.
        for c in string.punctuation:
            text = text.replace(c, ' ')

        # Now make it a Series of words, and do some cleaning.
        words = Series(text.split(' '))
        words = words.str.lower()
        # Filter out words less than minimum word length.
        words = words[words.str.len() >= min_word_len]
        words = words[~words.str.contains(r'[^#@a-z]')]  # drop tokens containing anything other than a-z, '#', or '@'

        # Filter out globally-defined stopwords
        ignore = stops & set(words.unique())
        words_out = [w for w in words.tolist() if w not in ignore]

        return words_out
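# A minimal, self-contained sketch of the same cleaning pipeline. The tiny
# `stops` set below is a hypothetical stand-in for the globally defined
# stopword set that wordify() relies on above.
import string
from pandas import Series

stops = {'the', 'of', 'and', 'on'}

abs_list = ['Effects of metal-contaminated soil', 'on the growth of plants.']
text = ' '.join(abs_list)
for c in string.punctuation:
    text = text.replace(c, ' ')

words = Series(text.split()).str.lower()
words = words[words.str.len() >= 2]
words = words[~words.str.contains(r'[^#@a-z]')]  # keep lowercase letters, '#', '@' only
print([w for w in words.tolist() if w not in stops])
# ['effects', 'metal', 'contaminated', 'soil', 'growth', 'plants']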
Example #2
def pd_01():
    obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'c'])
    uniques = obj.unique()
    print(uniques)
    uniques.sort()  # in-place; sort() itself returns None
    print(uniques)
    print(pd.value_counts(obj, sort=False))
    mask = obj.isin(['b', 'c'])
    print(mask)
    print(obj[mask])
Example #3
    def test_value_counts_bins(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)

            # bins
            self.assertRaises(TypeError,
                              lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(),
                                          np.array([1, 2, 3], dtype=np.int64))
            self.assertEqual(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({0.998: 2,
                           1.5: 1,
                           2.0: 0,
                           2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series(
                {0.998: 0.5,
                 1.5: 0.25,
                 2.0: 0.0,
                 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
                        'd', 'd', 'a', 'a', 'b']
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            exp = np.array(['a', 'b', np.nan, 'd'], dtype=np.object_)
            self.assert_numpy_array_equal(s.unique(), exp)
            self.assertEqual(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected,
                                   check_index_type=False)
            # returned dtype differs depending on original
            self.assert_numpy_array_equal(s.unique(), np.array([]),
                                          check_dtype=False)
            self.assertEqual(s.nunique(), 0)
Example #4
def test_unique():
    # GH714 also, dtype=float
    s = Series([1.2345] * 100)
    s[::2] = np.nan
    result = s.unique()
    assert len(result) == 2

    s = Series([1.2345] * 100, dtype='f4')
    s[::2] = np.nan
    result = s.unique()
    assert len(result) == 2

    # NAs in object arrays #714
    s = Series(['foo'] * 100, dtype='O')
    s[::2] = np.nan
    result = s.unique()
    assert len(result) == 2

    # decision about None
    s = Series([1, 2, 3, None, None, None], dtype=object)
    result = s.unique()
    expected = np.array([1, 2, 3, None], dtype=object)
    tm.assert_numpy_array_equal(result, expected)

    # GH 18051
    s = Series(Categorical([]))
    tm.assert_categorical_equal(s.unique(), Categorical([]), check_dtype=False)
    s = Series(Categorical([np.nan]))
    tm.assert_categorical_equal(s.unique(), Categorical([np.nan]),
                                check_dtype=False)
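# The behaviour these tests pin down is easy to see directly: Series.unique()
# returns values in order of first appearance and keeps a single NA marker,
# while nunique() drops NA unless told otherwise. A quick sketch:
import numpy as np
import pandas as pd

s = pd.Series(['b', 'a', np.nan, 'a', np.nan, 'c'])
print(s.unique())               # ['b' 'a' nan 'c']
print(s.nunique())              # 3  (NA excluded by default)
print(s.nunique(dropna=False))  # 4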
def _process_target_or_features_for_plotting(target_or_features, type_,
                                             plot_std_max):

    if isinstance(target_or_features, Series):

        is_target = True

    elif isinstance(target_or_features, DataFrame):

        is_target = False

    else:

        raise ValueError(
            'target_or_features ({}) is neither a Series nor a DataFrame.'.format(
                type(target_or_features)))

    if type_ == 'continuous':

        if is_target:

            target_or_features = Series(normalize_nd_array(
                target_or_features.values,
                '-0-',
                None,
                raise_for_bad_value=False),
                                        name=target_or_features.name,
                                        index=target_or_features.index)

        else:

            target_or_features = DataFrame(
                normalize_nd_array(target_or_features.values,
                                   '-0-',
                                   1,
                                   raise_for_bad_value=False),
                index=target_or_features.index,
                columns=target_or_features.columns)

        plot_min = max(-plot_std_max, nanmin(target_or_features.values))

        plot_max = min(plot_std_max, nanmax(target_or_features.values))

        colorscale = CONTINUOUS_COLORSCALE_FOR_MATCH

    else:

        plot_min = 0

        if type_ == 'categorical':

            if is_target:

                plot_max = target_or_features.unique().size - 1

            else:

                plot_max = target_or_features.unstack().unique().size - 1

            colorscale = make_colorscale(colors=CATEGORICAL_COLORS)

        elif type_ == 'binary':

            plot_max = 1

            colorscale = make_colorscale(colors=BINARY_COLORS_WHITE_BLACK)

        else:

            raise ValueError('Unknown type_: {}.'.format(type_))

    return target_or_features, plot_min, plot_max, colorscale
Example #6
    def get_problem_type(y: Series):
        """ Identifies which type of prediction problem we are interested in (if user has not specified).
            I.e. binary classification, multi-class classification, or regression.
        """
        if len(y) == 0:
            raise ValueError("provided labels cannot have length = 0")
        y = y.dropna(
        )  # Remove missing values from y (there should not be any though as they were removed in Learner.general_data_processing())
        num_rows = len(y)

        unique_values = y.unique()
        unique_count = len(unique_values)
        if unique_count > 10:
            logger.log(
                20,
                f'Here are the first 10 unique label values in your data:  {list(unique_values[:10])}'
            )
        else:
            logger.log(
                20,
                f'Here are the {unique_count} unique label values in your data:  {list(unique_values)}'
            )

        MULTICLASS_LIMIT = 1000  # if numeric and class count would be above this amount, assume it is regression
        if num_rows > 1000:
            REGRESS_THRESHOLD = 0.05  # if the unique-ratio is less than this, we assume multiclass classification, even when labels are integers
        else:
            REGRESS_THRESHOLD = 0.1

        if unique_count == 2:
            problem_type = BINARY
            reason = "only two unique label-values observed"
        elif unique_values.dtype == 'object':
            problem_type = MULTICLASS
            reason = "dtype of label-column == object"
        elif np.issubdtype(unique_values.dtype, np.floating):
            unique_ratio = unique_count / float(num_rows)
            if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <=
                                                        MULTICLASS_LIMIT):
                try:
                    can_convert_to_int = np.array_equal(y, y.astype(int))
                    if can_convert_to_int:
                        problem_type = MULTICLASS
                        reason = "dtype of label-column == float, but few unique label-values observed and label-values can be converted to int"
                    else:
                        problem_type = REGRESSION
                        reason = "dtype of label-column == float and label-values can't be converted to int"
                except:
                    problem_type = REGRESSION
                    reason = "dtype of label-column == float and label-values can't be converted to int"
            else:
                problem_type = REGRESSION
                reason = "dtype of label-column == float and many unique label-values observed"
        elif np.issubdtype(unique_values.dtype, np.integer):
            unique_ratio = unique_count / float(num_rows)
            if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <=
                                                        MULTICLASS_LIMIT):
                problem_type = MULTICLASS  # TODO: Check if integers are from 0 to n-1 for n unique values, if they have a wide spread, it could still be regression
                reason = "dtype of label-column == int, but few unique label-values observed"
            else:
                problem_type = REGRESSION
                reason = "dtype of label-column == int and many unique label-values observed"
        else:
            raise NotImplementedError('label dtype', unique_values.dtype,
                                      'not supported!')
        logger.log(
            25,
            f"AutoGluon infers your prediction problem is: {problem_type}  (because {reason})."
        )
        logger.log(
            25,
            f"If this is wrong, please specify `problem_type` argument in fit() instead "
            f"(You may specify problem_type as one of: {[BINARY, MULTICLASS, REGRESSION]})\n"
        )
        return problem_type
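# A stripped-down sketch of the same heuristic, with the AutoGluon constants
# and logging omitted; the thresholds mirror the ones used above.
import numpy as np
import pandas as pd

def guess_problem_type(y: pd.Series) -> str:
    y = y.dropna()
    unique_count = y.nunique()
    regress_threshold = 0.05 if len(y) > 1000 else 0.1
    few = (unique_count / len(y) <= regress_threshold) and (unique_count <= 1000)
    if unique_count == 2:
        return 'binary'
    if y.dtype == object:
        return 'multiclass'
    if np.issubdtype(y.dtype, np.floating):
        return 'multiclass' if few and np.array_equal(y, y.astype(int)) else 'regression'
    if np.issubdtype(y.dtype, np.integer):
        return 'multiclass' if few else 'regression'
    raise NotImplementedError(f'label dtype {y.dtype} not supported')

print(guess_problem_type(pd.Series([0, 1, 1, 0])))          # binary
print(guess_problem_type(pd.Series(['a', 'b', 'c', 'a'])))  # multiclass
print(guess_problem_type(pd.Series(np.random.rand(50))))    # regression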
Example #7
    def test_value_counts_bins(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)

            # bins
            self.assertRaises(TypeError,
                              lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            if isinstance(s1, Index):
                tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
            else:
                exp = np.array([1, 2, 3], dtype=np.int64)
                tm.assert_numpy_array_equal(s1.unique(), exp)

            self.assertEqual(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({
                0.998: 2,
                1.5: 1,
                2.0: 0,
                2.5: 1
            },
                          index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({
                0.998: 0.5,
                1.5: 0.25,
                2.0: 0.0,
                2.5: 0.25
            },
                           index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = [
                'a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b'
            ]
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            if isinstance(s, Index):
                exp = Index(['a', 'b', np.nan, 'd'])
                tm.assert_index_equal(s.unique(), exp)
            else:
                exp = np.array(['a', 'b', np.nan, 'd'], dtype=object)
                tm.assert_numpy_array_equal(s.unique(), exp)
            self.assertEqual(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(),
                                   expected,
                                   check_index_type=False)
            # returned dtype differs depending on original
            if isinstance(s, Index):
                self.assert_index_equal(s.unique(), Index([]), exact=False)
            else:
                self.assert_numpy_array_equal(s.unique(),
                                              np.array([]),
                                              check_dtype=False)

            self.assertEqual(s.nunique(), 0)
# If not every value in a row or column is NaN, NaN values are skipped in the
# calculation; the skipna option lets you keep them from being skipped.
# The default for skipna is True.
print(df.sum(axis=1, skipna=False))  # any NaN in the row makes the result NaN

# Methods such as idxmin and idxmax return indirect statistics, e.g. the index
# label of the minimum or maximum value.
print(df.idxmax())
print(df.idxmin())

# Cumulative method: cumsum()
print(df.cumsum())

# Unique values and frequency counts
s1 = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
unique = s1.unique()
print(unique)  # duplicates removed
print(unique.sort())  # prints None: ndarray.sort() sorts in place

cnt = s1.value_counts()  # counts each value (frequencies); returns a Series
print(cnt)  # sorted in descending order; ties keep first-seen order

# isin tells you whether each value is contained in the given set.
# It returns boolean values (True, False) and is useful for picking out
# the values you want from a DataFrame or Series.
mask = s1.isin(['b', 'c'])
print(mask)  # True where the value is 'b' or 'c'
print(s1[mask])  # use the mask to show only the values you want

data = DataFrame({
    'Q1': [1, 3, 4, 3, 4],
Example #9
# Descriptive statistics
df = DataFrame([[1.4, np.nan], [7.1, -4.5], [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum()
df.sum(axis=1)
df.mean(axis=1, skipna=False)
df.idxmax()
df.describe()
# corr cov corrwith

# Unique values, value counts, membership
obj = Series(list('cadaabbcc'))
uniques = obj.unique()
uniques
uniques.sort()
obj.value_counts()
pd.value_counts(obj.values, sort=False)
mask = obj.isin(['b', 'c'])
mask
obj[mask]

# Handling missing data

dates = pd.date_range('20170101', periods=6)
df = DataFrame(np.arange(24).reshape((6, 4)),
               index=dates,
               columns=['a', 'b', 'c', 'd'])
df.iloc[0, 1] = np.nan
def excelFromPictures(path,picture):
    SecretId = ""
    SecretKey = ""     
    
    with open(picture,"rb") as f:
            img_data = f.read()
    img_base64 = b64encode(img_data)
    cred = credential.Credential(SecretId, SecretKey)  # SecretId and SecretKey are issued by Tencent Cloud
    httpProfile = HttpProfile()
    httpProfile.endpoint = "ocr.tencentcloudapi.com"

    clientProfile = ClientProfile()
    clientProfile.httpProfile = httpProfile
    client = ocr_client.OcrClient(cred, "ap-shanghai", clientProfile)

    req = models.TableOCRRequest()
    params = '{"ImageBase64":"' + str(img_base64, 'utf-8') + '"}'
    req.from_json_string(params)
#    false=0
    try:

        resp = client.TableOCR(req)
        #     print(resp.to_json_string())

    except TencentCloudSDKException as err:
        print("错误[",err,"]\n可重试")
        


    ## Extract the recognized data and build the JSON result
    result1 = loads(resp.to_json_string())

    # RowTl is the cell's row index, ColTl its column index, and Text its content
    rowIndex = []
    colIndex = []
    content = []

    for item in result1['TextDetections']:
        rowIndex.append(item['RowTl'])
        colIndex.append(item['ColTl'])
        content.append(item['Text'])

    ## Export to Excel
    ## ExcelWriter approach
    rowIndex = Series(rowIndex)
    colIndex = Series(colIndex)

    index = rowIndex.unique()
    index.sort()

    columns = colIndex.unique()
    columns.sort()

    data = DataFrame(index = index, columns = columns)
    for i in range(len(rowIndex)):
        data.loc[rowIndex[i],colIndex[i]] = re.sub(" ","",content[i])

    writer = ExcelWriter(path + "/tables/" + re.match(r".*\.", f.name).group() + "xlsx", engine='xlsxwriter')
    data.to_excel(writer,sheet_name = 'Sheet1', index=False,header = False)
    writer.save()
    
    print("已经完成" + f.name + "的提取")
Example #11
# ### Extracting values

# #### Series
# #### unique

# In[79]:


f = Series(list("가나라다다나라다"))
f


# In[80]:


f.unique()


# In[81]:


tmp = f.unique()
print("정렬 전", tmp)
tmp.sort()
print("정렬 후", tmp)


# In[82]:


f.duplicated()
Example #12
from pandas import DataFrame,Series
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

y_df2 = pd.read_csv('data/759funds.csv',index_col=0)
fund_names2 = Series(y_df2.columns)
fund_names = fund_names2.unique()
ind = y_df2.index


# funds = {}
# for name in fund_names:
# 	funds[name] = []
# 	f = open('all_output/fund_'+name+'_beta.txt')
# 	cols=-1
# 	for line in f.readlines():
# 		if line[0]=='b':
# 			funds[name].append([])
# 			cols=cols+1
# 		else:
# 			funds[name][cols].append(float(line))
# 	f.close()

alphas = {}
for name in fund_names:
	alphas[name]=[]
	f = open('all_output/fund_'+name+'_alpha_median_ma6.txt')
	for line in f.readlines():
		alphas[name].append(float(line))
	f.close()
Example #13
    def test_value_counts_bins(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)

            # bins
            pytest.raises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({Interval(0.997, 3.0): 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({Interval(0.997, 3.0): 1.0})
            tm.assert_series_equal(res1n, exp1n)

            if isinstance(s1, Index):
                tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
            else:
                exp = np.array([1, 2, 3], dtype=np.int64)
                tm.assert_numpy_array_equal(s1.unique(), exp)

            assert s1.nunique() == 3

            # these return the same
            res4 = s1.value_counts(bins=4, dropna=True)
            intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
            exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
            tm.assert_series_equal(res4, exp4)

            res4 = s1.value_counts(bins=4, dropna=False)
            intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
            exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
            tm.assert_series_equal(res4, exp4)

            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series([0.5, 0.25, 0.25, 0],
                           index=intervals.take([0, 3, 1, 2]))
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
                        'd', 'd', 'a', 'a', 'b']
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            if isinstance(s, Index):
                exp = Index(['a', 'b', np.nan, 'd'])
                tm.assert_index_equal(s.unique(), exp)
            else:
                exp = np.array(['a', 'b', np.nan, 'd'], dtype=object)
                tm.assert_numpy_array_equal(s.unique(), exp)
            assert s.nunique() == 3

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected,
                                   check_index_type=False)
            # returned dtype differs depending on original
            if isinstance(s, Index):
                tm.assert_index_equal(s.unique(), Index([]), exact=False)
            else:
                tm.assert_numpy_array_equal(s.unique(), np.array([]),
                                            check_dtype=False)

            assert s.nunique() == 0
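# Outside the test harness, the binned counts above can be reproduced directly;
# with a recent pandas the bins show up as an IntervalIndex (the exact left
# edge of the first interval depends on the version).
import pandas as pd

s1 = pd.Series([1, 1, 2, 3])
print(s1.value_counts(bins=1))
# (0.997, 3.0]    4
print(s1.value_counts(bins=4).sort_index())
# (0.997, 1.5]    2
# (1.5, 2.0]      1
# (2.0, 2.5]      0
# (2.5, 3.0]      1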
    ["CVX", "XOM", "BP"], start=datetime.datetime(2010, 1, 1), end=datetime.datetime(2013, 1, 1)
)["Adj Close"]
prices.head()

volume = pdweb.get_data_yahoo(
    ["CVX", "XOM", "BP"], start=datetime.datetime(2010, 1, 1), end=datetime.datetime(2013, 1, 1)
)["Volume"]
volume.head()

rets = prices.pct_change()

# Correlation of the stock returns
rcorr = rets.corr()

prices.plot()
volume.plot()

import seaborn as sns
import matplotlib.pyplot as plt

# seaborn correlation plot between pct change in stock price
sns.corrplot(rets, annot=False, diag_names=False)

prices.cov()  # covariance matrix

# unique values of a series
ser1 = Series(["w", "w", "x", "y", "z", "w", "x", "y", "x", "a"])
ser1.unique()

ser1.value_counts()
Example #15
    def fit(self, X: pd.DataFrame, y: pd.Series):
        """
        Learns the numbers that should be used to replace the categories in each
        variable. That is the WoE.


        Parameters
        ----------

        X : pandas dataframe of shape = [n_samples, n_features]
            The training input samples.
            Can be the entire dataframe, not just the categorical variables.

        y : pandas series.
            Target, must be binary [0,1].

        Attributes
        ----------

        encoder_dict_: dictionary
            The dictionary containing the {category: WoE} pairs per variable.
        """

        X = self._check_fit_input_and_variables(X)

        # check that y is binary
        if any(x for x in y.unique() if x not in [0, 1]):
            raise ValueError(
                "This encoder is only designed for binary classification, values of y "
                "can be only 0 or 1.")

        temp = pd.concat([X, y], axis=1)
        temp.columns = list(X.columns) + ["target"]

        self.encoder_dict_ = {}

        total_pos = temp["target"].sum()
        total_neg = len(temp) - total_pos
        temp["non_target"] = np.where(temp["target"] == 1, 0, 1)

        for var in self.variables:
            pos = temp.groupby([var])["target"].sum() / total_pos
            neg = temp.groupby([var])["non_target"].sum() / total_neg

            t = pd.concat([pos, neg], axis=1)
            t["woe"] = np.log(t["target"] / t["non_target"])

            if (not t.loc[t["target"] == 0, :].empty
                    or not t.loc[t["non_target"] == 0, :].empty):
                raise ValueError(
                    "The proportion of one of the classes for a category in "
                    "variable {} is zero, and log of zero is not defined".
                    format(var))

            self.encoder_dict_[var] = t["woe"].to_dict()

        self._check_encoding_dictionary()

        self.input_shape_ = X.shape

        return self
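# A toy, self-contained version of the WoE computation performed per variable
# in fit() above, for one categorical column and a binary target.
import numpy as np
import pandas as pd

df = pd.DataFrame({'colour': ['red', 'red', 'blue', 'blue', 'blue', 'green', 'green'],
                   'target': [1, 0, 1, 1, 0, 0, 1]})

total_pos = df['target'].sum()
total_neg = len(df) - total_pos
pos = df.groupby('colour')['target'].sum() / total_pos            # share of positives per category
neg = (1 - df['target']).groupby(df['colour']).sum() / total_neg  # share of negatives per category
woe = np.log(pos / neg)
print(woe.to_dict())
# {'blue': 0.405..., 'green': -0.287..., 'red': -0.287...}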
    def test_value_counts_inferred(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ["a", "b", "b", "b", "b", "c", "d", "d", "a", "a"]
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=["b", "a", "d", "c"])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
            self.assertEqual(s.nunique(), 4)
            # don't sort, have to sort after the fact as not sorting is
            # platform-dep
            hist = s.value_counts(sort=False).sort_values()
            expected = Series([3, 1, 4, 2], index=list("acbd")).sort_values()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list("cdab"))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([0.4, 0.3, 0.2, 0.1], index=["b", "a", "d", "c"])
            tm.assert_series_equal(hist, expected)

            # bins
            self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
            self.assertEqual(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ["a", "b", "b", "b", np.nan, np.nan, "d", "d", "a", "a", "b"]
            s = klass(s_values)
            expected = Series([4, 3, 2], index=["b", "a", "d"])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.array(["a", "b", np.nan, "d"], dtype="O"))
            self.assertEqual(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected, check_index_type=False)
            self.assert_numpy_array_equal(s.unique(), np.array([]))
            self.assertEqual(s.nunique(), 0)

            # GH 3002, datetime64[ns]
            # don't test names though
            txt = "\n".join(
                [
                    "xxyyzz20100101PIE",
                    "xxyyzz20100101GUM",
                    "xxyyzz20100101EGG",
                    "xxyyww20090101EGG",
                    "foofoo20080909PIE",
                    "foofoo20080909GUM",
                ]
            )
            f = StringIO(txt)
            df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"], parse_dates=["dt"])

            s = klass(df["dt"].copy())
            s.name = None

            idx = pd.to_datetime(["2010-01-01 00:00:00Z", "2008-09-09 00:00:00Z", "2009-01-01 00:00:00X"])
            expected_s = Series([3, 2, 1], index=idx)
            tm.assert_series_equal(s.value_counts(), expected_s)

            expected = np_array_datetime64_compat(
                ["2010-01-01 00:00:00Z", "2009-01-01 00:00:00Z", "2008-09-09 00:00:00Z"], dtype="datetime64[ns]"
            )
            if isinstance(s, DatetimeIndex):
                expected = DatetimeIndex(expected)
                self.assertTrue(s.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(s.unique(), expected)

            self.assertEqual(s.nunique(), 3)

            # with NaT
            s = df["dt"].copy()
            s = klass([v for v in s.values] + [pd.NaT])

            result = s.value_counts()
            self.assertEqual(result.index.dtype, "datetime64[ns]")
            tm.assert_series_equal(result, expected_s)

            result = s.value_counts(dropna=False)
            expected_s[pd.NaT] = 1
            tm.assert_series_equal(result, expected_s)

            unique = s.unique()
            self.assertEqual(unique.dtype, "datetime64[ns]")

            # numpy_array_equal cannot compare pd.NaT
            self.assert_numpy_array_equal(unique[:3], expected)
            self.assertTrue(unique[3] is pd.NaT or unique[3].astype("int64") == pd.tslib.iNaT)

            self.assertEqual(s.nunique(), 3)
            self.assertEqual(s.nunique(dropna=False), 4)

            # timedelta64[ns]
            td = df.dt - df.dt + timedelta(1)
            td = klass(td, name="dt")

            result = td.value_counts()
            expected_s = Series([6], index=[Timedelta("1day")], name="dt")
            tm.assert_series_equal(result, expected_s)

            expected = TimedeltaIndex(["1 days"])
            if isinstance(td, TimedeltaIndex):
                self.assertTrue(td.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(td.unique(), expected.values)

            td2 = timedelta(1) + (df.dt - df.dt)
            td2 = klass(td2, name="dt")
            result2 = td2.value_counts()
            tm.assert_series_equal(result2, expected_s)
    l = l.value_counts()

    print "Question:3."
    print "-----------"
    print "During what hour was the server the busiest in terms of requests?"
    print "Answer:"
    print "-------"
    print "The MAXIMUM number of requests were made in the hour '%s'.\nIn this hour, a total of %d requests were made" % (l.idxmax(),l.max())

    l = HTTP_DF[HTTP_DF['url'].str.contains('.gif',case=False)]['url']
    l = l.value_counts()
    print "\n"
  
    print "Question:4."
    print "-----------"
    print "Which .gif image was downloaded the most during the day?"
    print "Answer:"
    print "-------"
    print "The MAXIMUM number of downloads were made for the image '%s'.\nThis image was downloaded %d times" % (l.idxmax(),l.max())    

    l = HTTP_DF[HTTP_DF['retcode'] != 200]['retcode']
    
    print "\n"
    print "Question:5."
    print "-----------"
    print "What HTTP reply codes were sent other than 200?"
    print "Answer:"
    print "-------"
    print "The following return codes (other than 200) wrere sent:"
    print l.unique()
5     Mexico
6     Canada
7     Canada
8     Canada
9     Canada
10    Canada
11       NaN
12       NaN
13       NaN
14       NaN
dtype: object
'''

ser1.drop('b') # drops index 'b' and its associated value

ser1.unique() # returns unique values within a series
ser1.value_counts() # returns counts of values in a Series
'''
w    3
y    2
a    1
z    1
x    1
dtype: int64
'''

# hierarchical indexes are illustrated by these examples

ser = Series(randn(6), index = [[1,1,1,2,2,2],['a','b','c','a','b','c']])
'''
1  a    0.187640
# -*- coding: utf-8 -*- 

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

print('Unique values')
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(obj.unique())
print(obj.value_counts())
print()

print('Check membership')
mask = obj.isin(['b', 'c'])
print(mask)
print(obj[mask])  # only prints the elements b and c
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
print(data)
print(data.apply(pd.value_counts).fillna(0))
print(data.apply(pd.value_counts, axis=1).fillna(0))
Example #20
a = s.values

# get index
i = s.index

# assign name
s.name = 'name'

# length
assert len(s) == s.size == s.shape[0]

# number of elements that are not NaN
s.count()

# get an array of unique values
s.unique()

# count of each non-NaN value (like COUNT(*) ... GROUP BY); returns a Series
s.value_counts()

# aggregation and statistics
s.max()
s.mean()
s.var()

# location of the max element
s.idxmax()

# rank
s = Series([4, 1, 2, 5])
s.rank()                     # return [3,1,2,4]
Example #21
def count_result(ser: pd.Series):
    return ser.unique(), ser.value_counts()
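# Usage sketch:
import pandas as pd

uniques, counts = count_result(pd.Series(['a', 'b', 'a', 'a']))
print(uniques)  # ['a' 'b']
print(counts)   # a    3
                # b    1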
Example #22
def value_encoder(labels: pd.Series) -> Dict[Any, int]:
    ret = {}
    uniques = labels.unique()
    for index, label in enumerate(uniques):
        ret[label] = index
    return ret
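# Usage sketch: categories are numbered in order of first appearance.
import pandas as pd

print(value_encoder(pd.Series(['cat', 'dog', 'cat', 'bird'])))
# {'cat': 0, 'dog': 1, 'bird': 2}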
Example #23
price
volume = DataFrame({tic: data['Volume']
                    for tic, data in all_data.items()})
# percent changes of the prices:
returns = price.pct_change()
returns.tail()
returns.MSFT.corr(returns.IBM) # correlation of the overlapping non-NA
returns.MSFT.cov(returns.IBM) # covariance of the overlapping non-NA
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)
returns.corrwith(volume)
## Unique values, Value counts, and membership
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
uniques
obj.value_counts()
obj.value_counts() # value frequencies
from pandas import value_counts
value_counts(obj.values, sort=False)
obj
mask = obj.isin(['b', 'c'])
obj[mask]
mask
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
'Qu2': [2, 3, 1, 2, 3],
'Qu3': [1, 5, 2, 4, 4]})
data
data.Qu1
data.Qu1.value_counts()
Example #24
    def generate_arb_id_dictionary(self,
                                   a_timer: PipelineTimer,
                                   normalize_strategy: Callable,
                                   time_conversion: int = 1000,
                                   freq_analysis_accuracy: float = 0.0,
                                   freq_synchronous_threshold: float = 0.0,
                                   force: bool = False) -> (dict, dict):
        id_dictionary = {}
        j1979_dictionary = {}

        if force:
            # Remove any existing pickled Arb ID and J1979 dictionaries and create new ones based on data_filename.
            if path.isfile(self.id_output_filename):
                remove(self.id_output_filename)
            if path.isfile(self.j1979_output_filename):
                remove(self.j1979_output_filename)
            self.import_csv(a_timer, self.data_filename)
        elif path.isfile(self.id_output_filename):
            # This logic assumes that there will be a J1979 dict if and only if there is an Arb ID dict
            print("\tLoading Arb ID dictionary from pickled data: " +
                  getcwd() + "\\" + self.id_output_filename)
            id_dictionary = load(open(self.id_output_filename, "rb"))
            if path.isfile(self.j1979_output_filename):
                print("\tLoading J1979 dictionary from pickled data: " +
                      getcwd() + "\\" + self.j1979_output_filename)
                j1979_dictionary = load(open(self.j1979_output_filename, "rb"))
            print(
                "\tSet 'force_pre_processing' in Sample.py to True to re-compute instead..."
            )
            return id_dictionary, j1979_dictionary
        else:
            self.import_csv(a_timer, self.data_filename)

        a_timer.start_function_time()

        for arb_id in Series.unique(self.data['id']):
            if isinstance(arb_id, int64):
                if arb_id == 2015:
                    # This is the J1979 requests (if any) (ID 0x7DF = 2015). Just ignore it.
                    continue
                elif arb_id == 2024 and self.use_j1979:
                    # This is the J1979 responses (ID 0x7E8 = 2024)
                    j1979_data = self.data.loc[self.data['id'] ==
                                               arb_id].copy()
                    j1979_data.drop('dlc', axis=1, inplace=True)
                    j1979_data.drop('id', axis=1, inplace=True)
                    a_timer.start_nested_function_time()
                    j1979_dictionary = self.generate_j1979_dictionary(
                        j1979_data)
                    a_timer.set_j1979_creation()
                elif arb_id > 0:
                    a_timer.start_iteration_time()

                    this_id = ArbID(arb_id)
                    this_id.original_data = self.data.loc[self.data['id'] ==
                                                          arb_id]
                    this_id.original_data = this_id.original_data.copy(
                    )  # type: DataFrame

                    # Check if the Arbitration ID always used the same DLC. If not, ignore it.
                    # We can effectively ignore this Arb ID by not adding it to the Arb ID dictionary.
                    if this_id.original_data['dlc'].nunique() != 1:
                        continue
                    this_id.dlc = this_id.original_data['dlc'].iloc[0]
                    this_id.original_data.drop('dlc', axis=1, inplace=True)
                    this_id.original_data.drop('id', axis=1, inplace=True)

                    # If DLC < 8, we can automatically drop data column vectors > DLC.
                    # E.G. drop bytes "B7" and "B6" if DLC is 6; those are padding data injected by can-dump and were
                    # not actually on the bus.
                    if this_id.dlc < 8:
                        for i in range(this_id.dlc, 8):
                            this_id.original_data.drop('b' + str(i),
                                                       axis=1,
                                                       inplace=True)

                    # Check if there are duplicate index values and correct them.
                    if not this_id.original_data.index.is_unique:
                        correction_mask = this_id.original_data.index.duplicated(
                        )
                        this_id.original_data = this_id.original_data[
                            ~correction_mask]

                    this_id.generate_binary_matrix_and_tang(
                        a_timer, normalize_strategy)
                    this_id.analyze_transmission_frequency(
                        time_convert=time_conversion,
                        ci_accuracy=freq_analysis_accuracy,
                        synchronous_threshold=freq_synchronous_threshold)
                    id_dictionary[arb_id] = this_id

                    a_timer.set_arb_id_creation()

        a_timer.set_raw_df_to_arb_id_dict()

        return id_dictionary, j1979_dictionary
'''
0     w
1     y
2     a
3     w
4     y
5     z
6     b
7     q
8     w
9     g
10    h
dtype: object
'''

print(ser1.unique())
'''
['w' 'y' 'a' 'z' 'b' 'q' 'g' 'h']
'''

# count values
print(ser1.value_counts())
'''
w    3
y    2
q    1
g    1
z    1
h    1
b    1
Example #26
# ## 12. Unique values, value counts, and membership

# In[212]:

obj=Series(['c','a','d','a','a','b','d','d','c'])


# In[213]:

obj


# In[215]:

obj.unique()


# **value_counts returns the frequency of each value, in descending order by default. It is also available as a top-level pandas function that works on any array or sequence.**

# In[216]:

obj.value_counts()


# In[217]:

pd.value_counts(obj.values,sort=False)


# **isin: vectorized set-membership test**
# Basic summary statistics
df7 = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],index=['a','b','c','d'],columns=['one','two'])
df7.sum()
df7.sum(axis=1)
# with skipna=False, rows containing NaN do not get a mean
df7.mean(axis=1,skipna=False)
# index label of the maximum value
df7.idxmax()
# cumulative sums
df7.cumsum()
# several summary statistics at once
df7.describe()

# Unique values and value counts
obj = Series(['c','a','d','a','a','b','b','c','c'])
unique = obj.unique()
obj.value_counts()
pd.value_counts(obj.values,sort=True)
mask = obj.isin(['b','c'])
obj[mask]

# Detecting missing data
data8 = Series(['a','b',np.nan,'d'])
data8.isnull()
data8[2] = None
data8.isnull()
data9 = Series([1,np.nan,2,np.nan])
# dropna, as also available on DataFrame
data9.dropna()
count    4.000000
mean     4.500000
std      2.516611
min      2.000000
25%      3.500000
50%      4.000000
75%      5.000000
max      8.000000
dtype: float64
'''

print('Unique values')
obj = Series(['c', 'a', 'd', 'b', 'b', 'c'])
print(obj.unique())
print(obj.value_counts())
print()

print('Check membership')
mask = obj.isin(['b', 'c'])
print(mask)
print(obj[mask])  # only prints the elements b and c
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
print(data)
Example #29
import matplotlib.pyplot as plt

array1 = np.array([[10, np.nan, 20], [30, 40, np.nan]])
print(array1)
df1 = DataFrame(array1, index=[1, 2], columns=list('ABC'))
print(df1)

# sum()
print(df1.sum())         # sums along each column
print(df1.sum(axis=1))   # sums along each row

print(df1.min())
print(df1.max())

print(df1.idxmax())
print(df1.cumsum())
print(df1.describe())

df2 = DataFrame(randn(9).reshape(3, 3), index=[1, 2, 3], columns=list('ABC'))
print(df2)

plt.plot(df2)
plt.legend(df2.columns, loc="lower right")
plt.savefig('samplepic.png')
plt.show()

ser1 = Series(list('abcccaabd'))
print(ser1.unique())

print(ser1.value_counts())
Example #30
print(obj.describe())  # summary statistics for a Series
'''
count    4.000000
mean     4.500000
std      2.516611
min      2.000000
25%      3.500000
50%      4.000000
75%      5.000000
max      8.000000
dtype: float64
'''

print('Unique values')
obj = Series(['c', 'a', 'd', 'b', 'b', 'c'])
print(obj.unique())
print(obj.value_counts())
print()

print('Check membership')
mask = obj.isin(['b', 'c'])
print(mask)
print(obj[mask])  # only prints the elements b and c
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
print(data)
print(data.apply(pd.value_counts).fillna(0))
# how often each number appears in each column, missing counts filled with 0
print(data.apply(pd.value_counts, axis=1).fillna(0))
# how often each number appears in each row, missing counts filled with 0
# -*- coding: utf-8 -*- 

import numpy as np
import pandas as pd
from pandas import Series, DataFrame

print('Unique values')
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(obj.unique())
print(obj.value_counts())
print()

print('Check membership')
mask = obj.isin(['b', 'c'])
print(mask)
print(obj[mask])  # only prints the elements b and c
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
print(data)
print(data.apply(pd.value_counts).fillna(0))
print(data.apply(pd.value_counts, axis=1).fillna(0))
Example #32
def main():

    set_logger()
    logger = logging.getLogger('clusters.main')

    logger.info("Parsing arguments")
    file, clusters = get_args()

    logger.info("Load data and check for data consistency")
    df = pd.read_json(file)
    df = df.dropna(axis = 0, how='any')
    X = df.loc[:, ['lat', 'lng']].values
    logger.info("Performing KMeans clustering")
    kmeans = KMeans(n_clusters=clusters, max_iter=1000).fit(X)

    #Cluster metadata
    logger.info("Calculating cluster metadata")
    centers = {k: v for k, v in enumerate(kmeans.cluster_centers_)}
    logger.info("Counting number of crimes of each cluster")
    labels = Series(kmeans.labels_)
    num_labels = {}
    for l in labels.unique():
        num = labels[labels == l].count()
        num_labels[l] = num
    logger.debug("Number of occurrences of each label: {}".format(num_labels))

    logger.info("Transforming counting into percentage")
    total = labels.count()
    percentage = {k: v/total for k, v in num_labels.items()}
    logger.debug("Percentage of each label: {}".format(percentage))

    logger.info("Removing clusters with few points")
    labels_remove, lost_points = filter_clusters(percentage, X, labels)
    for l in labels_remove:
        percentage.pop(l, 'None')
        num_labels.pop(l, 'None')
        centers.pop(l, 'None')

    logger.debug("Number of occurrences of each label after filtering: {}".format(num_labels))
    logger.debug("Percentage of each label after filtering: {}".format(percentage))
    logger.debug("Number of filtered points: {}".format(lost_points.shape[0]))

    logger.info("Assign lost points to new clusters")
    new_labels = new_assignement(lost_points, centers)
    for l in new_labels:
        num_labels[l] += 1
    percentage = {k: v/total for k, v in num_labels.items()}

    logger.info("Save results in JSON")
    #Separate center into lat and lng
    centers_lat,  centers_lng= {}, {}
    for k, v in centers.items():
        centers_lat[k], centers_lng[k] = v[0], v[1]
    columns = [
        'Number of crimes',
        'Percentage of total crimes',
        'lat',
        'lng'
        ]
    df_meta = DataFrame({
        columns[0]: num_labels,
        columns[1]: percentage,
        columns[2]: centers_lat,
        columns[3]: centers_lng
    })
    _, basename = os.path.split(file)
    basename, _ = os.path.splitext(basename)
    df_meta.to_json(
        path_or_buf=basename+'Cluster.json',
        orient='records'
    )
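# The per-label counting loop above can also be expressed with value_counts;
# a small sketch on hypothetical cluster labels.
from pandas import Series

labels = Series([0, 1, 0, 2, 1, 0])
num_labels = labels.value_counts().to_dict()                 # {0: 3, 1: 2, 2: 1}
percentage = labels.value_counts(normalize=True).to_dict()   # {0: 0.5, 1: 0.333..., 2: 0.166...}
print(num_labels, percentage)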
Example #33
# If not every value in a row or column is NA, NA values are skipped in the
# calculation; the skipna option lets you keep them from being skipped.
# The default for skipna is True.
print(df.sum(axis=1, skipna=False))

# Methods such as idxmin and idxmax return indirect statistics, e.g. the index
# label of the minimum or maximum value.
print(df.idxmax())
print(df.idxmin())

# Cumulative method: cumsum()
print(df.cumsum())

# unique(): collapses duplicate values into one
s1 = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(s1.unique())

# value_counts(): counts each value (frequencies); the return value is a Series
print(s1.value_counts())  # output is sorted in descending order

# isin(): tells you whether each value is present in the given set
## returns boolean values (True, False)
mask = s1.isin(['b', 'c'])
print(mask)
print(s1[mask])  # this is how you can pull out only the values you want

data = DataFrame({
    'Q1': [1, 3, 4, 3, 4],
    'Q2': [2, 3, 1, 2, 3],
    'Q3': [1, 5, 2, 4, 4]
})
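# Following the other examples in this collection, the per-column value
# frequencies of the `data` frame defined just above can be tabulated with
# apply + value_counts (missing counts filled with 0):
import pandas as pd

print(data.apply(pd.value_counts).fillna(0))
#     Q1   Q2   Q3
# 1  1.0  1.0  1.0
# 2  0.0  2.0  1.0
# 3  2.0  2.0  0.0
# 4  2.0  0.0  2.0
# 5  0.0  0.0  1.0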
# In[147]:

df.mean()

# In[148]:

df.idxmax()

# In[149]:

df.describe()

# In[151]:

obj8 = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
u = obj8.unique()
u

# In[155]:

obj8.value_counts()

# In[157]:

pd.value_counts(obj8.values, sort=False)

# In[158]:

mask = obj8.isin(['b', 'c'])
mask
Example #35
    def test_value_counts_inferred(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
            self.assertEqual(s.nunique(), 4)
            # don't sort, have to sort after the fact as not sorting is platform-dep
            hist = s.value_counts(sort=False)
            hist.sort()
            expected = Series([3, 1, 4, 2], index=list('acbd'))
            expected.sort()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list('cdab'))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(hist, expected)

            # bins
            self.assertRaises(TypeError,
                              lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
            self.assertEqual(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({
                0.998: 2,
                1.5: 1,
                2.0: 0,
                2.5: 1
            },
                          index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({
                0.998: 0.5,
                1.5: 0.25,
                2.0: 0.0,
                2.5: 0.25
            },
                           index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = [
                'a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b'
            ]
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(
                s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
            self.assertEqual(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected)
            self.assert_numpy_array_equal(s.unique(), np.array([]))
            self.assertEqual(s.nunique(), 0)

            # GH 3002, datetime64[ns]
            txt = "\n".join([
                'xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG',
                'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM'
            ])
            f = StringIO(txt)
            df = pd.read_fwf(f,
                             widths=[6, 8, 3],
                             names=["person_id", "dt", "food"],
                             parse_dates=["dt"])

            s = klass(df['dt'].copy())

            idx = pd.to_datetime([
                '2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z',
                '2009-01-01 00:00:00X'
            ])
            expected_s = Series([3, 2, 1], index=idx)
            tm.assert_series_equal(s.value_counts(), expected_s)

            expected = np.array([
                '2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z',
                '2008-09-09 00:00:00Z'
            ],
                                dtype='datetime64[ns]')
            if isinstance(s, DatetimeIndex):
                expected = DatetimeIndex(expected)
                self.assertTrue(s.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(s.unique(), expected)

            self.assertEqual(s.nunique(), 3)

            # with NaT
            s = df['dt'].copy()
            s = klass([v for v in s.values] + [pd.NaT])

            result = s.value_counts()
            self.assertEqual(result.index.dtype, 'datetime64[ns]')
            tm.assert_series_equal(result, expected_s)

            result = s.value_counts(dropna=False)
            expected_s[pd.NaT] = 1
            tm.assert_series_equal(result, expected_s)

            unique = s.unique()
            self.assertEqual(unique.dtype, 'datetime64[ns]')
            # numpy_array_equal cannot compare pd.NaT
            self.assert_numpy_array_equal(unique[:3], expected)
            self.assertTrue(unique[3] is pd.NaT
                            or unique[3].astype('int64') == pd.tslib.iNaT)

            self.assertEqual(s.nunique(), 3)
            self.assertEqual(s.nunique(dropna=False), 4)

            # timedelta64[ns]
            td = df.dt - df.dt + timedelta(1)
            td = klass(td)

            result = td.value_counts()
            expected_s = Series([6], index=[Timedelta('1day')])
            tm.assert_series_equal(result, expected_s)

            expected = TimedeltaIndex(['1 days'])
            if isinstance(td, TimedeltaIndex):
                self.assertTrue(td.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(td.unique(), expected.values)

            td2 = timedelta(1) + (df.dt - df.dt)
            td2 = klass(td2)
            result2 = td2.value_counts()

            tm.assert_series_equal(result2, expected_s)
Example #36
def woe_analysis(df1, target, max_bin, force_bin):
    """
    wrapper function for mono_bin, char_bin, and woe_graph functions.
    This will automatically construct bins for each variable. For numerical
    variables, it will create bins such that the WOE relationship between bins 
    is monotonic.
    
    Parameters
    ----------
    
    df1 : pandas dataframe
        training dataset
        
    target : pandas series
        target vector
        
    max_bin : int
        the maximum number of bins (categories) for numeric variable binning. 
        
    force_bin : int
        For some numeric variables, the mono_bin function may produce only one 
        category while binning. 'force_bin' ensures that at least two
        categories will be produced.
        
    Return
    ------
    
    iv_df : pandas dataframe
        Weight of evidence / information value table and other data used to 
        calculate WOE and IV for variables in dataset
    
    iv : pandas dataframe
        Information value table for variables in dataset
    
    """
    max_bin = max_bin
    force_bin = force_bin
    stack = traceback.extract_stack()
    filename, lineno, function_name, code = stack[-2]
    vars_name = re.compile(r'\((.*?)\).*$').search(code).groups()[0]
    final = (re.findall(r"[\w']+", vars_name))[-1]

    x = df1.dtypes.index
    count = -1

    for i in x:
        if i.upper() not in (final.upper()):
            if np.issubdtype(df1[i],
                             np.number) and len(Series.unique(df1[i])) > 2:
                conv = mono_bin(target, df1[i], max_bin, force_bin)
                conv["VAR_NAME"] = i
                count = count + 1
                woe_graph(conv, True)
            else:
                conv = char_bin(target, df1[i])
                conv["VAR_NAME"] = i
                count = count + 1
                conv = conv.sort_values(by='WOE', ascending=False)
                woe_graph(conv, False)
            if count == 0:
                iv_df = conv
            else:
                iv_df = iv_df.append(conv, ignore_index=True)

    iv = pd.DataFrame({'IV': iv_df.groupby('VAR_NAME').IV.max()})
    iv = iv.reset_index()
    return iv_df, iv
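# mono_bin and char_bin are not shown here; a rough sketch of what one numeric
# binning step computes, using plain pd.cut in their place. The WoE per bin
# follows the definition used earlier in this collection, and the usual IV
# formula (sum of (event share - non-event share) * WoE) is assumed.
import numpy as np
import pandas as pd

target = pd.Series([0, 1, 0, 1, 1, 0, 1, 0, 1, 1])
x = pd.Series([1.0, 2.5, 3.0, 4.5, 5.0, 5.5, 7.0, 7.5, 9.0, 9.5])

bins = pd.cut(x, bins=3)
grouped = target.groupby(bins)
event = grouped.sum() / target.sum()
non_event = (grouped.count() - grouped.sum()) / (len(target) - target.sum())
woe = np.log(event / non_event)
iv = ((event - non_event) * woe).sum()
print(woe)
print('IV:', iv)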
print(df1.sum(axis=1))

print(df1.min())
print(df1.max())

print('----------')

print(df1.idxmax())
print(df1.idxmin())
print('----------')

print(df1.cumsum())
print('----------')

print(df1.describe())
print('----------')

df2 = DataFrame(randn(9).reshape(3, 3), index=[1, 2, 3], columns=list('ABC'))
print(df2)
print('----------')

plt.plot(df2)
plt.legend(df2.columns, loc="lower right")
plt.savefig("first graph in python")
plt.show()

ser1 = Series(list('abcccaabd'))
print(ser1.unique())

print(ser1.value_counts())
Example #38
def woe_conversion(df, woe):
    """
    Converts the values of each variable for each borrower from its original
    value into the weight of evidence (WOE) values of the variable bin that 
    the input value is in.
    
    Parameters
    ----------
    
    df : pandas dataframe
        Cleaned explanatory variable training / testing / validation data frame
        that will be used to fit the model.
        
    woe : pandas dataframe
        WOE / IV table that is output from woe_analysis() function
        
    Return
    ------
    
    df_copy : pandas dataframe
        Converted dataframe, from original input values to corresponding WOE values
        
    """
    df_copy = df.copy()
    woe_df = woe.copy()

    var_list = list(df_copy)

    for i in range(0, len(var_list)):
        var_str = "'%s'" % var_list[i]
        var_woe = woe_df.loc[woe_df['VAR_NAME'] == var_list[i]].copy()

        var_woe['max_range'] = var_woe['MAX_VALUE']

        if np.issubdtype(df_copy[var_list[i]], np.number) and\
        (len(Series.unique(df_copy[var_list[i]])) > 2):
            var_woe['min_range'] = var_woe.groupby(
                'VAR_NAME')['MAX_VALUE'].shift(1)
            var_woe.loc[var_woe['MIN_VALUE'].isnull(), 'min_range'].isnull()
            var_woe.loc[var_woe['min_range'].isnull(),
                        'min_range'] = var_woe['MIN_VALUE']
        else:
            var_woe['min_range'] = var_woe['MIN_VALUE']

        var_woe_clean = var_woe[var_woe['MIN_VALUE'].notnull()]
        var_woe_null = var_woe[var_woe['MIN_VALUE'].isnull()]

        if not var_woe_null.empty:
            woe_null = var_woe_null.iloc[0]['WOE']
        else:
            woe_null = np.nan

        min_value_list = var_woe_clean['min_range'].tolist()
        max_value_list = var_woe_clean['max_range'].tolist()
        choices = var_woe_clean['WOE'].tolist()

        cond_str_list = []

        N = len(min_value_list)

        for j in range(0, len(min_value_list)):

            #condition for binary indicator variables
            if np.issubdtype(df_copy[var_list[i]], np.number) and \
            (len(Series.unique(df_copy[var_list[i]])) == 2) and \
            min_value_list[j] == max_value_list[j]:
                com_str = "(df_copy[" + var_str + "] ==" + str(
                    min_value_list[j]) + ")"

            elif np.issubdtype(df_copy[var_list[i]], np.number):
                if j == 0:
                    com_str = "(df_copy[" + var_str + "] <=" + str(
                        max_value_list[j]) + ")"
                elif j == (N - 1):
                    com_str = "(df_copy[" + var_str + "] >" + str(
                        min_value_list[j]) + ")"
                else:
                    com_str = "(df_copy[" + var_str + "] >" + str(
                        min_value_list[j]
                    ) + ") & (df_copy[" + var_str + "] <=" + str(
                        max_value_list[j]) + ")"

            else:
                char_str = "'%s'" % min_value_list[j]
                com_str = "(df_copy[" + var_str + "] ==" + char_str + ")"

            cond_str_list.append(com_str)

        full_conds = ','.join(cond_str_list)

        conditions = eval(full_conds)
        var_woe_label = var_list[i] + '_woe'

        df_copy[var_woe_label] = np.select(conditions,
                                           choices,
                                           default=woe_null)
        df_copy = df_copy.drop([var_list[i]], axis=1)
    return df_copy
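A minimal usage sketch for woe_conversion() on a made-up single-variable WOE table; the VAR_NAME / MIN_VALUE / MAX_VALUE / WOE columns follow the function above, while the data itself is purely illustrative:

import numpy as np
import pandas as pd

df_demo = pd.DataFrame({'age': [22, 35, 58, np.nan]})
woe_demo = pd.DataFrame({
    'VAR_NAME':  ['age', 'age', 'age'],
    'MIN_VALUE': [np.nan, 18.0, 40.0],   # the NaN row carries the WOE for missing values
    'MAX_VALUE': [np.nan, 40.0, 80.0],
    'WOE':       [0.10, -0.30, 0.50],
})
print(woe_conversion(df_demo, woe_demo))
# age_woe: -0.3 (22 <= 40), -0.3 (35 <= 40), 0.5 (58 > 40), 0.1 (missing -> null-bin WOE)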
Example #39
0
def infer_problem_type(y: Series, silent=False) -> str:
    """ Identifies which type of prediction problem we are interested in (if user has not specified).
        i.e. binary classification, multi-class classification, or regression.
    """
    if len(y) == 0:
        raise ValueError("provided labels cannot have length = 0")
    y = y.dropna()  # Remove missing values from y (there should not be any, though, as they were removed in Learner.general_data_processing())
    num_rows = len(y)

    unique_values = y.unique()

    MULTICLASS_LIMIT = 1000  # if numeric and class count would be above this amount, assume it is regression
    if num_rows > 1000:
        REGRESS_THRESHOLD = 0.05  # if the unique-ratio is less than this, we assume multiclass classification, even when labels are integers
    else:
        REGRESS_THRESHOLD = 0.1

    unique_count = len(unique_values)
    if unique_count == 2:
        problem_type = BINARY
        reason = "only two unique label-values observed"
    elif y.dtype.name in ['object', 'category', 'string']:
        problem_type = MULTICLASS
        reason = f"dtype of label-column == {y.dtype.name}"
    elif np.issubdtype(y.dtype, np.floating):
        unique_ratio = unique_count / float(num_rows)
        if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <=
                                                    MULTICLASS_LIMIT):
            try:
                can_convert_to_int = np.array_equal(y, y.astype(int))
                if can_convert_to_int:
                    problem_type = MULTICLASS
                    reason = "dtype of label-column == float, but few unique label-values observed and label-values can be converted to int"
                else:
                    problem_type = REGRESSION
                    reason = "dtype of label-column == float and label-values can't be converted to int"
            except:
                problem_type = REGRESSION
                reason = "dtype of label-column == float and label-values can't be converted to int"
        else:
            problem_type = REGRESSION
            reason = "dtype of label-column == float and many unique label-values observed"
    elif np.issubdtype(y.dtype, np.integer):
        unique_ratio = unique_count / float(num_rows)
        if (unique_ratio <= REGRESS_THRESHOLD) and (unique_count <=
                                                    MULTICLASS_LIMIT):
            problem_type = MULTICLASS  # TODO: Check if integers are from 0 to n-1 for n unique values, if they have a wide spread, it could still be regression
            reason = "dtype of label-column == int, but few unique label-values observed"
        else:
            problem_type = REGRESSION
            reason = "dtype of label-column == int and many unique label-values observed"
    else:
        raise NotImplementedError(f'label dtype {y.dtype} not supported!')
    if not silent:
        logger.log(
            25,
            f"AutoGluon infers your prediction problem is: '{problem_type}' (because {reason})."
        )

        # TODO: Move this outside of this function so it is visible even if problem type was not inferred.
        if problem_type in [BINARY, MULTICLASS]:
            if unique_count > 10:
                logger.log(
                    20,
                    f'\tFirst 10 (of {unique_count}) unique label values:  {list(unique_values[:10])}'
                )
            else:
                logger.log(
                    20,
                    f'\t{unique_count} unique label values:  {list(unique_values)}'
                )
        elif problem_type == REGRESSION:
            y_max = y.max()
            y_min = y.min()
            y_mean = y.mean()
            y_stddev = y.std()
            logger.log(
                20,
                f'\tLabel info (max, min, mean, stddev): ({y_max}, {y_min}, {round(y_mean, 5)}, {round(y_stddev, 5)})'
            )

        logger.log(
            25,
            f"\tIf '{problem_type}' is not the correct problem_type, please manually specify the problem_type parameter during predictor init "
            f"(You may specify problem_type as one of: {[BINARY, MULTICLASS, REGRESSION]})"
        )
    return problem_type
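The float branch above hinges on whether the label values survive a round trip through int; here is a standalone illustration of that check (toy data, nothing AutoGluon-specific):

import numpy as np
from pandas import Series

y_float = Series([1.0, 2.0, 1.0, 3.0])
print(np.array_equal(y_float, y_float.astype(int)))  # True: floats that are really class ids
y_cont = Series([1.25, 2.5, 1.75, 3.0])
print(np.array_equal(y_cont, y_cont.astype(int)))    # False: genuinely continuous labels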
Example #40
0
def create_mappings(x: pd.Series):
    labels = x.unique()
    lbl2idx = {label: idx for idx, label in enumerate(labels)}
    return lbl2idx, labels
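For example, on a toy series:

import pandas as pd

lbl2idx, labels = create_mappings(pd.Series(['cat', 'dog', 'cat', 'bird']))
print(lbl2idx)  # {'cat': 0, 'dog': 1, 'bird': 2}
print(labels)   # ['cat' 'dog' 'bird']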
Example #41
0
    def test_value_counts_bins(self, klass):
        s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
        s = klass(s_values)

        # bins
        with pytest.raises(TypeError):
            s.value_counts(bins=1)

        s1 = Series([1, 1, 2, 3])
        res1 = s1.value_counts(bins=1)
        exp1 = Series({Interval(0.997, 3.0): 4})
        tm.assert_series_equal(res1, exp1)
        res1n = s1.value_counts(bins=1, normalize=True)
        exp1n = Series({Interval(0.997, 3.0): 1.0})
        tm.assert_series_equal(res1n, exp1n)

        if isinstance(s1, Index):
            tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
        else:
            exp = np.array([1, 2, 3], dtype=np.int64)
            tm.assert_numpy_array_equal(s1.unique(), exp)

        assert s1.nunique() == 3

        # these return the same
        res4 = s1.value_counts(bins=4, dropna=True)
        intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
        exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
        tm.assert_series_equal(res4, exp4)

        res4 = s1.value_counts(bins=4, dropna=False)
        intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
        exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
        tm.assert_series_equal(res4, exp4)

        res4n = s1.value_counts(bins=4, normalize=True)
        exp4n = Series([0.5, 0.25, 0.25, 0],
                       index=intervals.take([0, 3, 1, 2]))
        tm.assert_series_equal(res4n, exp4n)

        # handle NA's properly
        s_values = [
            'a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b'
        ]
        s = klass(s_values)
        expected = Series([4, 3, 2], index=['b', 'a', 'd'])
        tm.assert_series_equal(s.value_counts(), expected)

        if isinstance(s, Index):
            exp = Index(['a', 'b', np.nan, 'd'])
            tm.assert_index_equal(s.unique(), exp)
        else:
            exp = np.array(['a', 'b', np.nan, 'd'], dtype=object)
            tm.assert_numpy_array_equal(s.unique(), exp)
        assert s.nunique() == 3

        s = klass({})
        expected = Series([], dtype=np.int64)
        tm.assert_series_equal(s.value_counts(),
                               expected,
                               check_index_type=False)
        # returned dtype differs depending on original
        if isinstance(s, Index):
            tm.assert_index_equal(s.unique(), Index([]), exact=False)
        else:
            tm.assert_numpy_array_equal(s.unique(),
                                        np.array([]),
                                        check_dtype=False)

        assert s.nunique() == 0
Example #42
0
def con_column_analysis(con_column: Series):
    print(con_column.describe())
    print("There are " + con_column.isnull().sum() + " nan values.")
    print("Null values accounts for  %2f".format(con_column.isnull().sum() /
                                                 len(con_column)))
    print("This column has " + con_column.unique().__len__() + "items.")
Example #43
0
}
f = DataFrame(dic,index=np.arange(100,80,-1))
f2 = DataFrame({'line03':np.linspace(30,35,10),'line04':np.arange(10)},index=np.arange(100,90,-1));
f3 = f.add(f2)
# Summation
# Sum of each column
sum = f.sum()
# Sum of specified columns
sum1 = f[['line01','line02']].sum()
# Sum of each row
sum2 = f.sum(axis=1)
# print sum
# print sum1
# print sum2
# If a row or column contains NaN, the result is NaN; skipna defaults to True, which ignores NaN
sum4 = f3.sum(skipna=False)
# print sum4

# Get all unique values
S2 = Series(['c','d','a','c','c','c','r','a','d'])
uniques = S2.unique()
# print uniques
# Get the number of times each value appears
uniques_counts = S2.value_counts()
# print uniques_counts

# Get value counts for several DataFrame columns at once
result = f.apply(pd.value_counts).fillna(0)
print result

Example #44
0
def cat_column_analysis(cat_column: Series):
    print(cat_column.describe())
    print("There are " + cat_column.isnull().sum() + " nan values")
    print("Null values accounts for  %2f".format(cat_column.isnull().sum() /
                                                 len(cat_column)))
    print("In this column, there are ", cat_column.unique(), " items")
def main():
    """
    Calculation and aggregation of summary statistics
    """

    # Summary of statistics
    # return is not ndarray
    df = DataFrame([[1.4, np.nan],
                    [7.1, -4.5],
                    [np.nan, np.nan],
                    [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print df
    print df.sum()
    print df.sum(axis=1)
    print df.mean(axis=1) # exclude nan
    print df.mean(axis=1, skipna=False)
    print df.idxmin()
    print df.idxmax()
    print df.cumsum()
    print df.describe()
    # values are not number
    obj = Series(list('aabc') * 4)
    print obj.describe()


    methods = ['count', 'min', 'max', # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std',
               'skew', 'kurt', 'cummin', 'cummax', 'cumprod',
               'diff', 'pct_change']

    for method in methods:
        print u'「{0}」'.format(method)
        print getattr(df, method)()
        print ''

    # Correlation and Covariance
    all_data = {}
    lst = [] # ['AAPL', 'IBM', 'MSFT'] #, 'GOOG']:
    for ticket in lst: #, 'GOOG']:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
    price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})
    volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})
    if all_data:
        returns = price.pct_change()
        print returns.tail()
        print ''
        print returns.MSFT.corr(returns.IBM)
        print returns.MSFT.cov(returns.IBM)
        print ''
        print returns.corr()
        print returns.cov()
        print ''
        print returns.corrwith(returns.IBM)
        print returns.corrwith(volume)

    # unique values, value counts, membership (isin)
    print '',''
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print uniques
    print obj.value_counts()
    print pd.value_counts(obj.values, sort=False)
    mask = obj.isin(['b', 'c'])
    print mask
    print obj[mask]

    data = DataFrame({
        'Qu1' : [1,3,4,3,4],
        'Qu2' : [2,3,1,2,3],
        'Qu3' : [1,5,2,4,4],
    })
    print data
    print data.apply(pd.value_counts).fillna(0)
def create_property_values(
    row: pd.Series, scope: str, domain: str, dtypes: pd.Series
) -> dict:
    """
    This function generates the property values for a row in a file

    Parameters
    ----------
    row : pd.Series
        The current row of the data frame to create property values for
    scope : str
        The domain to create the property values in
    domain : str
        The domain to create the property values in
    dtypes : pd.Series
        The data types of each column to create property values for

    Returns
    -------
    properties : dict {str, models.PerpetualProperty}
    """

    # Ensure that all data types in the file have been mapped
    if not (
        set([str(data_type) for data_type in dtypes.unique()])
        <= set(global_constants["data_type_mapping"])
    ):
        raise TypeError(
            """There are data types in the data_frame which have not been mapped to LUSID data types,
            please ensure that all data types have been mapped before retrying"""
        )

    # Initialise the empty properties dictionary
    properties = {}

    # Iterate over each column name and data type
    for column_name, data_type in dtypes.items():

        # Set the data type to be a string so that it is easier to work with
        string_data_type = str(data_type)
        # Convert the numpy data type to a LUSID data type using the global mapping
        lusid_data_type = global_constants["data_type_mapping"][string_data_type]
        # Get the value of the column from the row
        row_value = row[column_name]

        # Use the correct LUSID property value based on the data type
        if lusid_data_type == "string":
            if pd.isna(row_value):
                continue
            property_value = lusid.models.PropertyValue(label_value=row_value)

        if lusid_data_type == "number":
            # Handle null values given the input null value override
            if pd.isnull(row_value):
                continue
            property_value = lusid.models.PropertyValue(
                metric_value=lusid.models.MetricValue(value=row_value)
            )

        # Set the property
        property_key = (
            f"{domain}/{scope}/{cocoon.utilities.make_code_lusid_friendly(column_name)}"
        )
        properties[property_key] = lusid.models.PerpetualProperty(
            key=property_key, value=property_value
        )

    if domain.lower() == "instrument":
        properties = list(properties.values())

    return properties
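The function above relies on a module-level global_constants["data_type_mapping"] that maps pandas/numpy dtype names onto LUSID value types ("string" / "number"). A plausible minimal shape for that mapping is sketched below purely as an assumption; it is not the actual table shipped with the lusid/cocoon tooling:

global_constants = {
    "data_type_mapping": {
        "object": "string",    # text columns  -> label_value properties
        "int64": "number",     # integer columns -> metric_value properties
        "float64": "number",   # float columns   -> metric_value properties
    }
}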
Example #47
0
    def test_value_counts_inferred(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(s.value_counts(), expected)
            
            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
            self.assertEquals(s.nunique(), 4)
            # don't sort, have to sort after the fact as not sorting is platform-dep
            hist = s.value_counts(sort=False)
            hist.sort()
            expected = Series([3, 1, 4, 2], index=list('acbd'))
            expected.sort()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list('cdab'))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(hist, expected)

            # bins
            self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
            self.assertEquals(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b']
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
            self.assertEquals(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected)
            self.assert_numpy_array_equal(s.unique(), np.array([]))
            self.assertEquals(s.nunique(), 0)

            # GH 3002, datetime64[ns]
            txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG',
                             'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM'])
            f = StringIO(txt)
            df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"],
                             parse_dates=["dt"])

            s = klass(df['dt'].copy())

            idx = pd.to_datetime(['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00Z'])
            expected_s = Series([3, 2, 1], index=idx)
            tm.assert_series_equal(s.value_counts(), expected_s)

            expected = np.array(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z'],
                                dtype='datetime64[ns]')
            if isinstance(s, DatetimeIndex):
                expected = DatetimeIndex(expected)
                self.assert_(s.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(s.unique(), expected)

            self.assertEquals(s.nunique(), 3)

            # with NaT
            s = df['dt'].copy()
            s = klass([v for v in s.values] + [pd.NaT])

            result = s.value_counts()
            self.assertEqual(result.index.dtype, 'datetime64[ns]')
            expected_s[pd.NaT] = 1
            tm.assert_series_equal(result, expected_s)

            unique = s.unique()
            self.assertEqual(unique.dtype, 'datetime64[ns]')
            # numpy_array_equal cannot compare pd.NaT
            self.assert_numpy_array_equal(unique[:3], expected)
            self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT)

            self.assertEquals(s.nunique(), 4)

            # timedelta64[ns]
            td = df.dt - df.dt + timedelta(1)
            td = klass(td)

            result = td.value_counts()
            expected_s = Series([6], index=[86400000000000])
            self.assertEqual(result.index.dtype, 'int64')
            tm.assert_series_equal(result, expected_s)

            # get nanoseconds to compare
            expected = np.array([86400000000000])
            self.assert_numpy_array_equal(td.unique(), expected)
            self.assertEquals(td.nunique(), 1)

            td2 = timedelta(1) + (df.dt - df.dt)
            td2 = klass(td2)
            result2 = td2.value_counts()

            self.assertEqual(result2.index.dtype, 'int64')
            tm.assert_series_equal(result2, expected_s)

            self.assert_numpy_array_equal(td.unique(), expected)
            self.assertEquals(td.nunique(), 1)
Example #48
0
prices = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'],
                              start=datetime.datetime(2010, 1, 1),
                              end=datetime.datetime(2013, 1, 1))['Adj Close']
prices.head()

volume = pdweb.get_data_yahoo(['CVX', 'XOM', 'BP'],
                              start=datetime.datetime(2010, 1, 1),
                              end=datetime.datetime(2013, 1, 1))['Volume']
volume.head()

rets = prices.pct_change()

# Correlation of the stocks
rcorr = rets.corr()

prices.plot()
volume.plot()

import seaborn as sns
import matplotlib.pyplot as plt

# seaborn correlation plot between pct change in stock price
sns.corrplot(rets, annot=False, diag_names=False)

prices.cov()  # covariance matrix of the prices

# unique values of a series
ser1 = Series(['w', 'w', 'x', 'y', 'z', 'w', 'x', 'y', 'x', 'a'])
ser1.unique()

ser1.value_counts()
Example #49
0
            return choice(range(len(weights)), p=weights)
        else:
            return None
# The lambda below is the base measure: a function that generates samples.
# Drawing directly from a continuous normal gives (almost surely) all-unique samples.
################################
# Because I set α=10 (which is relatively small), the approximation
# is fairly coarse. In terms of memoization, a small α value means
# the stochastic memoizer will more frequently reuse values already
# seen instead of drawing new ones.
###############################
base_measure = lambda: norm().rvs()
ndraws = 10000
print("Number of unique samples after {} draws..".format(ndraws))
draws = Series([base_measure() for _ in range(ndraws)])
print(draws.unique().size)
################################
norm_dp = DirichletProcessSample(base_measure, alpha=100)
print("Number of unique samples after {} draws:".format(ndraws))
dp_draws = Series([norm_dp() for _ in range(ndraws)])
print(dp_draws.unique().size)
#################################
Series(norm_dp() for _ in range(10000)).hist()
_=plt.title("Histogram of Samples from norm_dp")
plt.show()
################################
norm_hdp = DirichletProcessSample(norm_dp, alpha=10) #samples..
Series(norm_hdp() for _ in range(10000)).hist()
_=plt.title("Histogram of Samples from norm_hdp")
plt.show()
###############################
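The DirichletProcessSample class used above is not defined in this excerpt; the fragment at the top of this example is the tail of its weighted-die helper. Below is a minimal stick-breaking sketch that is consistent with that usage, offered as an assumption rather than a restoration of the original implementation:

from numpy.random import choice
from scipy.stats import beta


class DirichletProcessSample:
    """Lazily draws from a Dirichlet process via stick breaking, memoizing previous draws."""

    def __init__(self, base_measure, alpha):
        self.base_measure = base_measure  # callable returning one draw from the base distribution
        self.alpha = alpha                # concentration: larger alpha -> more fresh draws, fewer repeats
        self.cache = []                   # values drawn so far
        self.weights = []                 # stick pieces already broken off
        self.total_stick_used = 0.0

    def __call__(self):
        remaining = 1.0 - self.total_stick_used
        i = self.roll_die(self.weights + [remaining])
        if i is not None and i < len(self.weights):
            # reuse (memoize) a value that was drawn before
            return self.cache[i]
        # otherwise break a new piece off the remaining stick and draw a fresh value
        stick_piece = beta(1, self.alpha).rvs() * remaining
        self.total_stick_used += stick_piece
        self.weights.append(stick_piece)
        new_value = self.base_measure()
        self.cache.append(new_value)
        return new_value

    @staticmethod
    def roll_die(weights):
        if weights:
            return choice(range(len(weights)), p=weights)
        else:
            return None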
Example #50
0
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    print("get data:" + ticker)
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2010', '1/30/2010')
price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.items()})
volume = DataFrame({tic: data['Volume'] for tic, data in all_data.items()})

returns = price.pct_change()
print(returns.tail())
print(returns.MSFT.corr(returns.IBM))
print(returns.MSFT.cov(returns.IBM))
print(returns.corr())
print(returns.cov())
print(returns.corrwith(returns.IBM))
print(returns.corrwith(volume))

obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
print(obj.unique())
print(obj.value_counts())
print(pd.value_counts(obj.values, sort=False))
mask = obj.isin(['b', 'c'])
print(mask)
print(obj[mask])

data = DataFrame({
    'QU1': [1, 3, 4, 3, 4],
    'QU2': [2, 3, 1, 2, 3],
    'QU3': [1, 5, 2, 4, 4]
})
print(data.apply(pd.value_counts).fillna(0))