Example #1
0
    def test_categorical_nans(self):
        s = Series(pd.Categorical(list('aaaaabbbcc')))  # 4,3,2,1 (nan)
        s.iloc[1] = np.nan
        result = s.value_counts()
        expected = pd.Series([4, 3, 2], index=pd.CategoricalIndex(
            ['a', 'b', 'c'], categories=['a', 'b', 'c']))
        tm.assert_series_equal(result, expected, check_index_type=True)
        result = s.value_counts(dropna=False)
        expected = pd.Series([
            4, 3, 2, 1
        ], index=pd.CategoricalIndex(['a', 'b', 'c', np.nan]))
        tm.assert_series_equal(result, expected, check_index_type=True)

        # out of order
        s = Series(pd.Categorical(
            list('aaaaabbbcc'), ordered=True, categories=['b', 'a', 'c']))
        s.iloc[1] = np.nan
        result = s.value_counts()
        expected = pd.Series([4, 3, 2], index=pd.CategoricalIndex(
            ['a', 'b', 'c'], categories=['b', 'a', 'c'], ordered=True))
        tm.assert_series_equal(result, expected, check_index_type=True)

        result = s.value_counts(dropna=False)
        expected = pd.Series([4, 3, 2, 1], index=pd.CategoricalIndex(
            ['a', 'b', 'c', np.nan], categories=['b', 'a', 'c'], ordered=True))
        tm.assert_series_equal(result, expected, check_index_type=True)
Example #2
0
class ValueCounts(object):

    params = ['int', 'float', 'object']
    param_names = ['dtype']

    def setup(self, dtype):
        self.s = Series(np.random.randint(0, 1000, size=100000)).astype(dtype)

    def time_value_counts(self, dtype):
        self.s.value_counts()
Example #3
0
    def test_value_counts_bins(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)

            # bins
            self.assertRaises(TypeError,
                              lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(),
                                          np.array([1, 2, 3], dtype=np.int64))
            self.assertEqual(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({0.998: 2,
                           1.5: 1,
                           2.0: 0,
                           2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series(
                {0.998: 0.5,
                 1.5: 0.25,
                 2.0: 0.0,
                 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
                        'd', 'd', 'a', 'a', 'b']
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            exp = np.array(['a', 'b', np.nan, 'd'], dtype=np.object_)
            self.assert_numpy_array_equal(s.unique(), exp)
            self.assertEqual(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected,
                                   check_index_type=False)
            # returned dtype differs depending on original
            self.assert_numpy_array_equal(s.unique(), np.array([]),
                                          check_dtype=False)
            self.assertEqual(s.nunique(), 0)
Example #4
0
    def test_categorical(self):
        s = Series(pd.Categorical(list('aaabbc')))
        result = s.value_counts()
        expected = pd.Series([3, 2, 1],
                             index=pd.CategoricalIndex(['a', 'b', 'c']))
        tm.assert_series_equal(result, expected, check_index_type=True)

        # preserve order?
        s = s.cat.as_ordered()
        result = s.value_counts()
        expected.index = expected.index.as_ordered()
        tm.assert_series_equal(result, expected, check_index_type=True)
Example #5
0
    def test_value_counts(self):
        s = Series(['a', 'b', 'b', 'b', 'b', 'a', 'c', 'd', 'd', 'a'])
        hist = s.value_counts()
        expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
        assert_series_equal(hist, expected)

        # handle NA's properly
        s[5:7] = np.nan
        hist = s.value_counts()
        expected = s.dropna().value_counts()
        assert_series_equal(hist, expected)

        s = Series({})
        hist = s.value_counts()
        expected = Series([])
        assert_series_equal(hist, expected)
Example #6
0
def main():
    path = 'usagov_bitly_data2013-05-17-1368832207'
    records = [json.loads(line) for line in open(path)]
    tzs = [rec['tz'] for rec in records if 'tz' in rec]
    counts = getCounts(tzs)
    print counts
    top10 = topCounts(counts)
    print top10

    """Pandas DataFrame demo"""
    frame = DataFrame(records)
    clean_tz = frame['tz'].fillna('Missing')
    clean_tz[clean_tz == ''] = 'Unknown'
    tz_counts = clean_tz.value_counts()
    print tz_counts[:10]
    tz_counts[:10].plot(kind='barh', rot=0)
    plt.show()

    """Pandas Series demo"""
    results = Series([x.split()[0] for x in frame['a'].dropna()])
    agents_counts = results.value_counts()
    print agents_counts[:8]

    cframe = frame[frame['a'].notnull()]
    os_seq = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
    print os_seq[:8]
    by_tz_os = cframe.groupby(['tz', os_seq])
    agg_counts = by_tz_os.size().unstack().fillna(0)
    print agg_counts[:8]
    indexer = agg_counts.sum(1).argsort()
    print indexer[:8]
    counts_subset = agg_counts.take(indexer)[:10]
    counts_subset.plot(kind='barh', stacked=True)
    plt.show()
Example #7
0
def count_fun4(records):

    frame = DataFrame(records)
    results = Series([x.split()[0] for x in frame.a.dropna()])
    print results[:5]
    counts = results.value_counts()[:10]
    counts.plot(kind='barh', rot=0)
    plt.show()
Example #8
0
def top10(tokens, text):
    obj = Series(tokens)
    top10 = obj.value_counts()[:10]
    print(top10)
    
    top10_list = list(top10.keys())
    text.dispersion_plot(top10_list)
    return top10_list
Example #9
0
 def test_categorical_zeroes(self):
     # keep the `d` category with 0
     s = Series(pd.Categorical(
         list('bbbaac'), categories=list('abcd'), ordered=True))
     result = s.value_counts()
     expected = Series([3, 2, 1, 0], index=pd.Categorical(
         ['b', 'a', 'c', 'd'], categories=list('abcd'), ordered=True))
     tm.assert_series_equal(result, expected, check_index_type=True)
Example #10
0
class Algorithms(object):

    params = ['index', 'series']
    param_names = ['typ']

    def setup(self, typ):
        data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'),
                Period('2011-03', freq='M'), Period('2011-04', freq='M')]

        if typ == 'index':
            self.vector = PeriodIndex(data * 1000, freq='M')
        elif typ == 'series':
            self.vector = Series(data * 1000)

    def time_drop_duplicates(self, typ):
        self.vector.drop_duplicates()

    def time_value_counts(self, typ):
        self.vector.value_counts()
Example #11
0
class period_algorithm(object):
    goal_time = 0.2

    def setup(self):
        data = [Period('2011-01', freq='M'), Period('2011-02', freq='M'),
                Period('2011-03', freq='M'), Period('2011-04', freq='M')]
        self.s = Series(data * 1000)
        self.i = PeriodIndex(data, freq='M')

    def time_period_series_drop_duplicates(self):
        self.s.drop_duplicates()

    def time_period_index_drop_duplicates(self):
        self.i.drop_duplicates()

    def time_period_series_value_counts(self):
        self.s.value_counts()

    def time_period_index_value_counts(self):
        self.i.value_counts()
Example #12
0
def main():
    path = '1_usagov_bitly_data2012-03-16-1331923249.txt'
    records = [json.loads(line) for line in open(path)]
    frame = DataFrame(records)

    tz_counts = frame['tz'].value_counts()
    print "Top timezones:"
    print tz_counts[:10]
    print ""

    clean_tz = frame['tz'].fillna('Missing')
    clean_tz[clean_tz == ''] = 'Unknown'
    tz_counts2 = clean_tz.value_counts()
    print "Cleaned top timezones:"
    print tz_counts2[:10]
    print ""

    agents = Series([x.split()[0] for x in frame['a'].dropna()])
    print "Top User Agents:"
    print agents.value_counts()[:10]
    print

    cframe = frame[frame['a'].notnull()]
    operating_system = np.where(
        cframe['a'].str.contains('Windows'),
        'Windows',
        'Not Windows'
    )
    by_timezone_os = cframe.groupby(['tz', operating_system])
    agg_counts = by_timezone_os.size().unstack()
    agg_counts.fillna(0, inplace=True)
    timezone_totals = agg_counts.sum(1).argsort()
    count_subset = agg_counts.take(timezone_totals)[-10:]
    print "OS split by top timezones by counts:"
    print count_subset
    print ""
Example #13
0
class Algorithms(object):
    goal_time = 0.2

    def setup(self):
        data = [
            Period("2011-01", freq="M"),
            Period("2011-02", freq="M"),
            Period("2011-03", freq="M"),
            Period("2011-04", freq="M"),
        ]
        self.s = Series(data * 1000)
        self.i = PeriodIndex(data, freq="M")

    def time_drop_duplicates_pseries(self):
        self.s.drop_duplicates()

    def time_drop_duplicates_pindex(self):
        self.i.drop_duplicates()

    def time_value_counts_pseries(self):
        self.s.value_counts()

    def time_value_counts_pindex(self):
        self.i.value_counts()
Example #14
0
def Main():
  client = github_helpers.authenticate()
  keywords = raw_input("Please, enter keywords to search repositories: ")
  if keywords == '':
    keywords = 'javascript'
    print 'No keywords provided. It will use the keyword: ' + keywords
  search = client.search_repositories(keywords)
  first_page = search.get_page(0)

  languages = Series(r.language for r in first_page)
  languages = languages.dropna()
  languages.sort()

  percentages = (100.0 * languages.value_counts() / len(languages)).map('{:,.2f} %'.format)

  print 'Languages percentage:'
  print percentages

  # Create plot
  x = [int(r.stargazers_count) for r in first_page]
  y = [int(r.forks) for r in first_page]

  # Add one to every value for logarithmic scale
  x = [val + 1 for val in x]
  y = [val + 1 for val in y]

  area = [100 for r in first_page]
  names = [r.name for r in first_page]
  colors = np.random.rand(len(first_page))
  pl.scatter(x, y, s=area, c=colors, alpha=0.5)
  for i in range(0, len(x)):
    pl.annotate(names[i], (x[i], y[i]), fontsize=2)
  pl.title("All values are with addition of 1 (for the logarithmic scale)")
  pl.xlabel("Stars")
  pl.xscale("log")
  pl.yscale("log")
  pl.ylabel("Forks")
  pl.tight_layout()
  filepath = 'reports/APIs/github'
  if not os.path.isdir(filepath): os.makedirs(filepath)
  filepath += '/search_repositories.png'
  pl.savefig(filepath, figsize=(1020, 1020), dpi=300)
  pl.close()
  print('A chart with high resolution and small font size (to minimize overlaps) was created at ' +
    filepath)
Example #15
0
def analysis2(records):
    print('\nPandas analysis >>')
    frame = DataFrame(records)
    # pp.pprint(frame)
    # tz_counts = frame['tz'].value_counts()
    # pp.pprint(tz_counts[:10])
    clean_tz = frame['tz'].fillna('Missing')
    clean_tz[clean_tz == ''] = 'Unknown'
    tz_counts = clean_tz.value_counts()
    print('\n>> Time zones')
    pp.pprint(tz_counts[:10])

    # plt.figure(figsize=(10, 4))
    # tz_counts[:10].plot(kind='barh', rot=0)
    # plt.show()

    clean_url = frame.a.fillna('Missing')
    clean_url[clean_url == ''] = 'Unknown'
    urls = Series([x.split()[0] for x in clean_url])
    urls_counts = urls.value_counts()
    print('\n >> URLs')
    pp.pprint(urls_counts[:10])

    cframe = frame[frame.a.notnull()]
    operation_system = np.where(cframe.a.str.contains('Windows'),
                                'Windows', 'Not Windows')
    print('\n>> OS')
    # print(operation_system[:5])
    by_tz_os = cframe.groupby(['tz', operation_system])
    agg_counts = by_tz_os.size().unstack().fillna(0)
    indexer = agg_counts.sum(1).argsort()
    count_subset = agg_counts.take(indexer)[-10:]
    pp.pprint(count_subset)

    plt.figure(figsize=(10, 4))
    count_subset.plot(kind='barh', rot=0, stacked=True)
    plt.show()
Example #16
0
        classifier = GaussianNB()
        classifier.fit(features_train, target_train)
        prediction = classifier.predict(features_test)

        accuracy = accuracy_score(target_test, prediction, normalize=True)
        print 'accuracy generated by scikit-learn: {}%'.format(accuracy * 100)
        print '-' * 50

        return features, target, accuracy, prediction

    sci_features, sci_target, sci_accuracy, sci_predictions = scikit_Gaussian(
        _dataset, split_ratio)

    #-- plotting predictions for our model against real counts:
    p_samples = Series(predictions)
    p_counts = p_samples.value_counts()
    t_sample = [item[-1] for item in test_set]
    t_samples = Series(t_sample[:len(predictions)])
    t_counts = t_samples.value_counts()
    title = 'Comparing diabetes real classes counts to our model prediction'
    legends = ['Our model', 'Real data']
    plotting_data(t_counts, p_counts, title, legends)

    #-- plotting predictions for sci-kit learn model against real counts:
    p_samples = Series(sci_predictions)
    p_counts = p_samples.value_counts()
    t_samples = Series(sci_target[-len(sci_predictions):])
    t_counts = t_samples.value_counts()
    title = 'Comparing diabetes real classes counts to sci-kit learn model prediction'
    legends = ['Sci-kit model', 'Real data']
    plotting_data(t_counts, p_counts, title, legends)
Example #17
0
print('-'*50)
print(myFrame.sum(axis = 1))
print('-'*50)
print(myFrame.cumsum()) # accumulation method - computes the cumulative sum
print('-'*50)
print(myFrame.mean(axis = 0))
print('-'*50)
print(myFrame.mean(axis = 1))
print('-'*50)
print(myFrame.mean(axis = 1, skipna = False))
print('-'*50)
print(myFrame.describe()) # basic summary statistics / %: quartile values / std: standard deviation
print('-'*50)
print(myFrame.idxmax())
print('-'*50)
mySeries = Series(['a', 'a', 'b', 'c', 'd'] * 2) 
# unique: number of distinct values / top: most frequent value / freq: count of the top value
print(mySeries.describe())
print('-'*50)
print(mySeries)
myUnique = mySeries.unique()
print(myUnique)
print('-'*50)
print(mySeries.value_counts())
print('-'*50)
print(pd.value_counts(mySeries.values, sort=False))
print('-'*50)
mask = mySeries.isin(['b','c'])
print(mask)
print(mySeries[mask])
Example #18
0
price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})
# percent changes of the prices:
returns = price.pct_change()
returns.tail()
returns.MSFT.corr(returns.IBM) # correlation of the overlapping non-NA
returns.MSFT.cov(returns.IBM) # covariance of the overlapping non-NA
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)
returns.corrwith(volume)
## Unique values, Value counts, and membership
obj = Series(['c', 'a', 'd', 'a', 'a', 'b', 'b', 'c', 'c'])
uniques = obj.unique()
uniques
obj.value_counts()
obj.value_counts() # value frequencies
from pandas import value_counts
value_counts(obj.values, sort=False)
obj
mask = obj.isin(['b', 'c'])
obj[mask]
mask
data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
'Qu2': [2, 3, 1, 2, 3],
'Qu3': [1, 5, 2, 4, 4]})
data
data.Qu1
data.Qu1.value_counts
data.Qu1.value_counts()
result = data.apply(value_counts).fillna(0)
Example #19
0
tz_counts[:10]

# plot

tz_counts[:10].plot(kind= 'barh', rot=0)

frame['a'][49]

results = Series([x.split()[0] for x in frame.a.dropna()])
# x.split() splits the string into a list; [0] takes the first piece
# DataFrame.dropna() removes missing data
# For a Series, dropna returns a Series containing only the non-null values and their index labels.
# Handling a DataFrame is trickier, because dropping means discarding at least a whole row (or column). As before, this is controlled with extra parameters: dropna(axis=0, how='any', thresh=None). The how parameter accepts 'any' or 'all'; 'all' drops a row (column) only when every element in it is NA. Another useful parameter is thresh, an integer: with thresh=3, for example, a row is kept only if it contains at least 3 non-NA values.
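# A tiny illustration of dropna's how= and thresh= parameters (throwaway demo
# data, not part of the bit.ly dataset above; assumes DataFrame is imported from pandas):
demo = DataFrame([[1, np.nan, 3], [np.nan, np.nan, np.nan], [4, 5, np.nan]])
demo.dropna()             # how='any' (default): drop every row containing an NA
demo.dropna(how='all')    # drop only the all-NA middle row
demo.dropna(thresh=2)     # keep rows that have at least 2 non-NA values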
results[:5]

results.value_counts()[:8]

# Break the time zone information down by Windows vs. non-Windows users
# For simplicity, assume a user is a Windows user whenever the agent string contains 'Windows'
# Since some agents are missing, first remove those records with notnull

cframe = frame[frame.a.notnull()]
#
# is(not)null returns the index with Boolean values; this is really NumPy-style Boolean indexing
# This pair of methods is applied element-wise and returns a Boolean array, typically used for Boolean indexing.

operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')

operating_system[:9]

operating_system1 = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')
Example #20
0
# save as a csv file
import csv
import pandas as pd

try:
    f = csv.writer(open('ws1.csv', 'w', encoding='utf-8'))
    f.writerow(word_dict)
except Exception as e:
    print('err : ', e)

# df1 = pd.read_csv('ws1.csv', encoding='utf-8')
# print(df1)

with open('ws1.csv', 'r', encoding='utf-8') as f:
    print(f.read())

print()
from pandas import Series, DataFrame
li_data = Series(wordlist)
#print(li_data)
print(li_data.value_counts()[:5])
print()
li_data = Series(word_dict)
print(li_data.value_counts()[:5])

print('-----------------')
df = DataFrame(wordlist, columns=['단어'])
print(df.head())

###############################################################
Example #21
0
from pandas import Series, DataFrame
import matplotlib.pyplot as plt

array1 = np.array([[10, np.nan, 20], [30, 40, np.nan]])
print(array1)
df1 = DataFrame(array1, index=[1, 2], columns=list('ABC'))
print(df1)
#sum()
print(df1.sum())
print(df1.sum(axis=1))

#min
print(df1.min())
print(df1.max())

print(df1.idxmax())
print(df1.idxmin())
print(df1.cumsum())
print(df1.describe())

df2 = DataFrame(randn(9).reshape(3, 3), index=[1, 2, 3], columns=list('ABC'))
print(df2)
plt.plot(df2)
plt.legend(df2.columns, loc="lower right")
plt.savefig('samplepic.png')
plt.show()

ser1 = Series(list('abcccaabd'))
print(ser1.unique())
print(ser1.value_counts())
Example #22
0
    def fit(self, X: np.ndarray, Z: np.ndarray, clusters: pd.Series,
            y: np.ndarray):
        """
        Fit MERF using EM algorithm.
        :param X (np.ndarray): fixed effect covariates
        :param Z (np.ndarray): random effect covariates
        :param clusters (pd.Series): cluster assignments for samples
        :param y (np.ndarray): response/target variable
        :return: fitted model
        """
        if type(clusters) != pd.Series:
            raise TypeError("clusters must be a pandas Series.")

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Input Checks ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        assert len(Z) == len(X)
        assert len(y) == len(X)
        assert len(clusters) == len(X)

        # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Initialization ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        n_clusters = clusters.nunique()
        n_obs = len(y)
        q = Z.shape[1]  # random effects dimension
        Z = np.array(
            Z
        )  # cast Z to numpy array (required if it's a dataframe, otw, the matrix mults later fail)

        # Create a series where cluster_id is the index and n_i is the value
        cluster_counts = clusters.value_counts()

        # Do expensive slicing operations only once
        Z_by_cluster = {}
        y_by_cluster = {}
        n_by_cluster = {}
        I_by_cluster = {}
        indices_by_cluster = {}

        # TODO: Can these be replaced with groupbys? Groupbys are less understandable than brute force.
        for cluster_id in cluster_counts.index:
            # Find the index for all the samples from this cluster in the large vector
            indices_i = clusters == cluster_id
            indices_by_cluster[cluster_id] = indices_i

            # Slice those samples from Z and y
            Z_by_cluster[cluster_id] = Z[indices_i]
            y_by_cluster[cluster_id] = y[indices_i]

            # Get the counts for each cluster and create the appropriately sized identity matrix for later computations
            n_by_cluster[cluster_id] = cluster_counts[cluster_id]
            I_by_cluster[cluster_id] = np.eye(cluster_counts[cluster_id])

        # Initialize for EM algorithm
        iteration = 0
        # Note we are using a dataframe to hold the b_hat because this is easier to index into by cluster_id
        # Before we were using a simple numpy array -- but we were indexing into that wrong because the cluster_ids
        # are not necessarily in order.
        b_hat_df = pd.DataFrame(np.zeros((n_clusters, q)),
                                index=cluster_counts.index)
        sigma2_hat = 1
        D_hat = np.eye(q)

        # vectors to hold history
        self.b_hat_history.append(b_hat_df)
        self.sigma2_hat_history.append(sigma2_hat)
        self.D_hat_history.append(D_hat)

        early_stop_flag = False

        while iteration < self.max_iterations and not early_stop_flag:
            iteration += 1
            logger.debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")
            logger.debug("Iteration: {}".format(iteration))
            logger.debug("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~")

            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ E-step ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            # fill up y_star for all clusters
            y_star = np.zeros(len(y))
            for cluster_id in cluster_counts.index:
                # Get cached cluster slices
                y_i = y_by_cluster[cluster_id]
                Z_i = Z_by_cluster[cluster_id]
                b_hat_i = b_hat_df.loc[cluster_id]  # used to be ix
                logger.debug("E-step, cluster {}, b_hat = {}".format(
                    cluster_id, b_hat_i))
                indices_i = indices_by_cluster[cluster_id]

                # Compute y_star for this cluster and put back in right place
                y_star_i = y_i - Z_i.dot(b_hat_i)
                y_star[indices_i] = y_star_i

            # check that still one dimensional
            # TODO: Other checks we want to do?
            assert len(y_star.shape) == 1

            # Do the fixed effects regression with all the fixed effects features
            self.fe_model.fit(X, y_star)
            f_hat = self.fe_model.predict(X)

            # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ M-step ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
            sigma2_hat_sum = 0
            D_hat_sum = 0

            for cluster_id in cluster_counts.index:
                # Get cached cluster slices
                indices_i = indices_by_cluster[cluster_id]
                y_i = y_by_cluster[cluster_id]
                Z_i = Z_by_cluster[cluster_id]
                n_i = n_by_cluster[cluster_id]
                I_i = I_by_cluster[cluster_id]

                # index into f_hat
                f_hat_i = f_hat[indices_i]

                # Compute V_hat_i
                V_hat_i = Z_i.dot(D_hat).dot(Z_i.T) + sigma2_hat * I_i

                # Compute b_hat_i
                V_hat_inv_i = np.linalg.pinv(V_hat_i)
                logger.debug(
                    "M-step, pre-update, cluster {}, b_hat = {}".format(
                        cluster_id, b_hat_df.loc[cluster_id]))
                b_hat_i = D_hat.dot(Z_i.T).dot(V_hat_inv_i).dot(y_i - f_hat_i)
                logger.debug(
                    "M-step, post-update, cluster {}, b_hat = {}".format(
                        cluster_id, b_hat_i))

                # Compute the total error for this cluster
                eps_hat_i = y_i - f_hat_i - Z_i.dot(b_hat_i)

                logger.debug("------------------------------------------")
                logger.debug("M-step, cluster {}".format(cluster_id))
                logger.debug("error squared for cluster = {}".format(
                    eps_hat_i.T.dot(eps_hat_i)))

                # Store b_hat for cluster both in numpy array and in dataframe
                # Note this HAS to be assigned with loc, otw whole df get erroneously assigned and things go to hell
                b_hat_df.loc[cluster_id, :] = b_hat_i
                logger.debug(
                    "M-step, post-update, recalled from db, cluster {}, "
                    "b_hat = {}".format(cluster_id, b_hat_df.loc[cluster_id]))

                # Update the sums for sigma2_hat and D_hat. We will update after the entire loop over clusters
                sigma2_hat_sum += eps_hat_i.T.dot(eps_hat_i) + sigma2_hat * (
                    n_i - sigma2_hat * np.trace(V_hat_inv_i))
                D_hat_sum += np.outer(b_hat_i, b_hat_i) + (D_hat - D_hat.dot(
                    Z_i.T).dot(V_hat_inv_i).dot(Z_i).dot(D_hat))  # noqa: E127

            # Normalize the sums to get sigma2_hat and D_hat
            sigma2_hat = (1.0 / n_obs) * sigma2_hat_sum
            D_hat = (1.0 / n_clusters) * D_hat_sum

            logger.debug("b_hat = {}".format(b_hat_df))
            logger.debug("sigma2_hat = {}".format(sigma2_hat))
            logger.debug("D_hat = {}".format(D_hat))

            # Store off history so that we can see the evolution of the EM algorithm
            self.b_hat_history.append(b_hat_df.copy())
            self.sigma2_hat_history.append(sigma2_hat)
            self.D_hat_history.append(D_hat)

            # Generalized Log Likelihood computation to check convergence
            gll = 0
            for cluster_id in cluster_counts.index:
                # Get cached cluster slices
                indices_i = indices_by_cluster[cluster_id]
                y_i = y_by_cluster[cluster_id]
                Z_i = Z_by_cluster[cluster_id]
                I_i = I_by_cluster[cluster_id]

                # Slice f_hat and get b_hat
                f_hat_i = f_hat[indices_i]
                R_hat_i = sigma2_hat * I_i
                b_hat_i = b_hat_df.loc[cluster_id]

                # Numerically stable way of computing log(det(A))
                _, logdet_D_hat = np.linalg.slogdet(D_hat)
                _, logdet_R_hat_i = np.linalg.slogdet(R_hat_i)

                gll += ((y_i - f_hat_i - Z_i.dot(b_hat_i)).T.dot(
                    np.linalg.pinv(R_hat_i)).dot(y_i - f_hat_i -
                                                 Z_i.dot(b_hat_i)) +
                        b_hat_i.T.dot(np.linalg.pinv(D_hat)).dot(b_hat_i) +
                        logdet_D_hat + logdet_R_hat_i)  # noqa: E127

            logger.info("GLL is {} at iteration {}.".format(gll, iteration))
            self.gll_history.append(gll)

            # Early Stopping. This code is entered only if the early stop threshold is specified and
            # if the gll_history array is longer than 1 element, e.g. we are past the first iteration.
            if self.gll_early_stop_threshold is not None and len(
                    self.gll_history) > 1:
                curr_threshold = np.abs(
                    (gll - self.gll_history[-2]) / self.gll_history[-2])
                logger.debug("stop threshold = {}".format(curr_threshold))

                if curr_threshold < self.gll_early_stop_threshold:
                    logger.info(
                        "Gll {} less than threshold {}, stopping early ...".
                        format(gll, curr_threshold))
                    early_stop_flag = True

        # Store off trained fixed effects model and b_hat as the model to be used in the prediction stage
        self.cluster_counts = cluster_counts
        self.trained_fe_model = self.fe_model
        self.trained_b = b_hat_df
        self.b_hat_history_df = self._convert_bhat_history(self.b_hat_history)

        return self
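For orientation, a minimal usage sketch of the fit signature above; the MERF() constructor name and the toy data are assumptions, not taken from the example:

import numpy as np
import pandas as pd

# Toy data: 100 samples, 3 fixed-effect covariates, a random intercept per
# cluster (Z is a column of ones), and cluster labels held in a pandas Series,
# as the input checks above require.
rng = np.random.RandomState(0)
X = rng.normal(size=(100, 3))
Z = np.ones((100, 1))
clusters = pd.Series(rng.randint(0, 5, size=100))
y = X[:, 0] + rng.normal(size=100)

# model = MERF()                  # assumed constructor of the class defined above
# model.fit(X, Z, clusters, y)    # EM loop runs until max_iterations or early stop
# model.cluster_counts            # per-cluster sizes from clusters.value_counts()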
Example #23
0
def main():
    """
    Calculation and aggregation of summary statistics
    """

    # Summary of statistics
    # return is not ndarray
    df = DataFrame([[1.4, np.nan],
                    [7.1, -4.5],
                    [np.nan, np.nan],
                    [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print df
    print df.sum()
    print df.sum(axis=1)
    print df.mean(axis=1) # exclude nan
    print df.mean(axis=1, skipna=False)
    print df.idxmin()
    print df.idxmax()
    print df.cumsum()
    print df.describe()
    # values are not number
    obj = Series(list('aabc') * 4)
    print obj.describe()


    methods = ['count', 'min', 'max', # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std',
               'skew', 'kurt', 'cummin', 'cummax', 'cumprod',
               'diff', 'pct_change']

    for method in methods:
        print u'「{0}」'.format(method)
        print getattr(df, method)()
        print ''

    # Correspond and Covariance
    all_data = {}
    lst = [] # ['AAPL', 'IBM', 'MSFT'] #, 'GOOG']:
    for ticket in lst: #, 'GOOG']:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
    price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})
    volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})
    if all_data:
        returns = price.pct_change()
        print returns.tail()
        print ''
        print returns.MSFT.corr(returns.IBM)
        print returns.MSFT.cov(returns.IBM)
        print ''
        print returns.corr()
        print returns.cov()
        print ''
        print returns.corrwith(returns.IBM)
        print returns.corrwith(volume)

    # unique, frequency, belong
    print '',''
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print uniques
    print obj.value_counts()
    print pd.value_counts(obj.values, sort=False)
    mask = obj.isin(['b', 'c'])
    print mask
    print obj[mask]

    data = DataFrame({
        'Qu1' : [1,3,4,3,4],
        'Qu2' : [2,3,1,2,3],
        'Qu3' : [1,5,2,4,4],
    })
    print data
    print data.apply(pd.value_counts).fillna(0)
Example #24
0
print(frame2.values[0, 2])

print()
#frame3 = frame2.drop('d')
frame3 = frame2.drop('d', axis=0)  # drop row 'd'
print(frame3)
frame4 = frame2.drop('tel', axis=1)  # drop the 'tel' column
print(frame4)

print()
print(frame3.sort_index(axis=0,
                        ascending=False))  # sort by row labels; ascending=False gives descending order
print(frame3.sort_index(axis=1, ascending=False))  # sort by column labels

print(frame3.rank(axis=0))

print()
print(frame3['juso'].value_counts())  # grouped counts by address

print()
data = {'juso': ['강남구 역삼동', '중구 신당동', '강남구 대치동'], 'inwon': [23, 25, 15]}

frame = DataFrame(data)
print(frame)
result1 = Series([x.split()[0] for x in frame.juso])  # split juso at the district ('gu') level
result2 = Series((x.split()[0] for x in frame.juso))  # split juso at the district ('gu') level
print(result1)
print(result2)

print(result2.value_counts())  # print data counts per district
Example #25
0
# missing values
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
print(tz_counts[:10])

# plot it
import matplotlib
tz_counts[:10].plot(kind='bar', rot=0)

print(frame['a'][1])

# user agents
agents = Series([x.split()[0] for x in frame.a.dropna()])
print(agents[:5])
print(agents.value_counts()[:8])

# OS
import numpy as np
cframe = frame[frame.a.notnull()]
oses = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Non-Windows')
print(oses[:5])

# grouping
by_tz_os = cframe.groupby(['tz', oses])
agg_counts = by_tz_os.size().unstack().fillna(0)
print(agg_counts[:10])

# top overall time zones
indexer = agg_counts.sum(1).argsort()
print(indexer[:10])
Example #26
0
returns.IBM.head()
returns.ix[:,2].corr(returns.ix[:,3])

returns.corr()

returns.cov()
returns.corrwith(returns.IBM)   #
returns.corrwith(volume)    # matched by column name


# In[241]:

# Unique values, value counts, and membership
obj = Series(['c','a','d ','a','a','b','b','c','c'])
obj.unique()
obj.value_counts(sort=False)
obj.isin(['b','c'])


# In[584]:

import pandas as pd

data=DataFrame({'qu1':[1,3,4,3,4],
                'qu2':[2,3,2,2,3],
                'qu3':[1,5,2,4,5]})
result = data.apply(pd.value_counts).fillna(0)
result


# In[594]:
Example #27
0
 def fit(self, values: pd.Series):
     self.levels = values.dropna().unique()
     self.encoder = values.value_counts().to_dict()
Example #28
0
 def fit_transform(self, values: pd.Series):
     self.levels = values.unique()
     self.encoder = values.value_counts().to_dict()
     return values.map(self.encoder)
Example #29
0
    def test_float64index_slicing_bug(self):
        # GH 5557, related to slicing a float index
        ser = {256: 2321.0,
               1: 78.0,
               2: 2716.0,
               3: 0.0,
               4: 369.0,
               5: 0.0,
               6: 269.0,
               7: 0.0,
               8: 0.0,
               9: 0.0,
               10: 3536.0,
               11: 0.0,
               12: 24.0,
               13: 0.0,
               14: 931.0,
               15: 0.0,
               16: 101.0,
               17: 78.0,
               18: 9643.0,
               19: 0.0,
               20: 0.0,
               21: 0.0,
               22: 63761.0,
               23: 0.0,
               24: 446.0,
               25: 0.0,
               26: 34773.0,
               27: 0.0,
               28: 729.0,
               29: 78.0,
               30: 0.0,
               31: 0.0,
               32: 3374.0,
               33: 0.0,
               34: 1391.0,
               35: 0.0,
               36: 361.0,
               37: 0.0,
               38: 61808.0,
               39: 0.0,
               40: 0.0,
               41: 0.0,
               42: 6677.0,
               43: 0.0,
               44: 802.0,
               45: 0.0,
               46: 2691.0,
               47: 0.0,
               48: 3582.0,
               49: 0.0,
               50: 734.0,
               51: 0.0,
               52: 627.0,
               53: 70.0,
               54: 2584.0,
               55: 0.0,
               56: 324.0,
               57: 0.0,
               58: 605.0,
               59: 0.0,
               60: 0.0,
               61: 0.0,
               62: 3989.0,
               63: 10.0,
               64: 42.0,
               65: 0.0,
               66: 904.0,
               67: 0.0,
               68: 88.0,
               69: 70.0,
               70: 8172.0,
               71: 0.0,
               72: 0.0,
               73: 0.0,
               74: 64902.0,
               75: 0.0,
               76: 347.0,
               77: 0.0,
               78: 36605.0,
               79: 0.0,
               80: 379.0,
               81: 70.0,
               82: 0.0,
               83: 0.0,
               84: 3001.0,
               85: 0.0,
               86: 1630.0,
               87: 7.0,
               88: 364.0,
               89: 0.0,
               90: 67404.0,
               91: 9.0,
               92: 0.0,
               93: 0.0,
               94: 7685.0,
               95: 0.0,
               96: 1017.0,
               97: 0.0,
               98: 2831.0,
               99: 0.0,
               100: 2963.0,
               101: 0.0,
               102: 854.0,
               103: 0.0,
               104: 0.0,
               105: 0.0,
               106: 0.0,
               107: 0.0,
               108: 0.0,
               109: 0.0,
               110: 0.0,
               111: 0.0,
               112: 0.0,
               113: 0.0,
               114: 0.0,
               115: 0.0,
               116: 0.0,
               117: 0.0,
               118: 0.0,
               119: 0.0,
               120: 0.0,
               121: 0.0,
               122: 0.0,
               123: 0.0,
               124: 0.0,
               125: 0.0,
               126: 67744.0,
               127: 22.0,
               128: 264.0,
               129: 0.0,
               260: 197.0,
               268: 0.0,
               265: 0.0,
               269: 0.0,
               261: 0.0,
               266: 1198.0,
               267: 0.0,
               262: 2629.0,
               258: 775.0,
               257: 0.0,
               263: 0.0,
               259: 0.0,
               264: 163.0,
               250: 10326.0,
               251: 0.0,
               252: 1228.0,
               253: 0.0,
               254: 2769.0,
               255: 0.0}

        # smoke test for the repr
        s = Series(ser)
        result = s.value_counts()
        str(result)
Example #30
0
# __Plot of tz_counts__________________________
tz_counts[:10].plot(kind='barh',rot=0)
import matplotlib.pyplot as plt
# plt.show()



# __Count element occurrences__________________________
frame['a'][1]
frame['a'][50]
frame['a'][51]
results=Series([x.split()[0] for x in frame.a.dropna()])   # .dropna() is a pandas method that removes empty rows; its arguments specify which rows to drop
   # str.split(x) splits str on the delimiter x and collects the pieces into a list
   # collect the whitespace-separated strings into a list (list comprehension) and turn them into pandas data with the Series class
results[:5]
results.value_counts()[:8]   # value_counts() counts how many times each element occurs


## __Count element occurrences (another way)__________________________
cframe=frame[frame.a.notnull()]   # keep only the non-null entries of frame's column a (cframe['a']==frame.a.dropna())
bool(map(list,[cframe['a'],frame.a.dropna()]))   # apply list to cframe['a'] and frame.a.dropna() and check whether they are the same





# __'Windows' or Not?__________________________
import numpy as np
operating_system=np.where(cframe['a'].str.contains('Windows'),'Windows','Not Windows')   # where cframe['a'] contains the string 'Windows': True gives 'Windows', False gives 'Not Windows'
   # same as `['Windows' if 'Windows' in x else 'Not Windows' for x in cframe['a']]`
operating_system[:5]
Example #31
0
def get_overexpressed_genes(matrix: ExpMatrix,
                            cell_labels: pd.Series,
                            exp_thresh: float = 0.05,
                            ignore_outliers: bool = True,
                            num_genes: int = 20) -> pd.DataFrame:
    """Determine most over-expressed genes for each cluster."""

    # make sure matrix and cell_labels are aligned
    matrix = matrix.loc[:, cell_labels.index]

    if ignore_outliers:
        # ignore the cluster named "Outliers", if it exists
        sel = (cell_labels != 'Outliers')
        matrix = matrix.loc[:, sel]
        cell_labels = cell_labels.loc[sel]

    _LOGGER.info('Ignoring mean expression values below %.3f', exp_thresh)

    data = []

    # scale matrix
    matrix = matrix.scale()

    # determine fold-changes for all clusters
    vc = cell_labels.value_counts()
    clusters = vc.index.tolist()
    X = np.zeros((len(clusters), matrix.num_genes), dtype=np.float32)
    cluster_mean = ExpMatrix(genes=matrix.genes, cells=clusters, data=X.T)
    for l in clusters:
        sel = (cell_labels == l)
        cluster_mean.loc[:, l] = matrix.loc[:, sel].mean(axis=1)

    # in calculation of fold change,
    # ignore all expression values below exp_thresh
    thresh_cluster_mean = cluster_mean.copy()
    thresh_cluster_mean[thresh_cluster_mean < exp_thresh] = exp_thresh

    # calculate fold change relative to average of other clusters
    X = np.ones((len(clusters), matrix.num_genes), dtype=np.float32)
    fold_change = ExpMatrix(genes=matrix.genes, cells=clusters, data=X.T)
    for l in clusters:
        sel = (thresh_cluster_mean.cells != l)
        fold_change.loc[:, l] = thresh_cluster_mean.loc[:, l] / \
                (thresh_cluster_mean.loc[:, sel].mean(axis=1))

    markers = []
    for l in clusters:
        change = fold_change.loc[:, l].sort_values(ascending=False)
        change = change[:num_genes]

        # scale mean expression values to 10K transcripts
        mean = cluster_mean.loc[change.index, l]
        mean = (10000 / cluster_mean.loc[:, l].sum()) * mean

        cluster_index = [l] * num_genes
        gene_index = change.index
        index = pd.MultiIndex.from_arrays([cluster_index, gene_index],
                                          names=['cluster', 'gene'])

        data = np.c_[change.values, mean.values]

        markers.append(
            pd.DataFrame(index=index,
                         columns=['Fold change', 'Mean expression (TP10K)'],
                         data=data))

    markers = pd.concat(markers, axis=0)

    #markers = markers.swaplevel(0, 1).sort_index(
    #    level=1, sort_remaining=False).swaplevel(0, 1)

    return markers
Example #32
0
}
f = DataFrame(dic,index=np.arange(100,80,-1))
f2 = DataFrame({'line03':np.linspace(30,35,10),'line04':np.arange(10)},index=np.arange(100,90,-1));
f3 = f.add(f2)
# Summation
# Sum of each column
sum = f.sum()
# Sum of the specified columns
sum1 = f[['line01','line02']].sum()
# Sum of each row
sum2 = f.sum(axis=1)
# print sum
# print sum1
# print sum2
# If a row or column contains NaN the result is NaN; skipna defaults to True, which ignores NaN
sum4 = f3.sum(skipna=False)
# print sum4

# Get all the distinct values
S2 = Series(['c','d','a','c','c','c','r','a','d'])
uniques = S2.unique()
# print uniques
# Get the number of occurrences of each value
uniques_counts = S2.value_counts()
# print uniques_counts

# Get value-count information for several DataFrame columns at once
result = f.apply(pd.value_counts).fillna(0)
print result

Example #33
0
# Create dataframe of records
# Tabular 2-by-2 spreadsheet table
frame = DataFrame(records)
# Get histogram (counts) easily
tz_counts = frame['tz'].value_counts()
# Check output 
# print tz_counts[:10]
# Data Munging (Clean the data)
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
# Check output
# print clean_tz.value_counts()[:10]
# Getting a plot
import matplotlib.pyplot as plt 
tz_counts[:10].plot(kind='barh', rot=0)
# show all plots
# plt.show()

# PARSING DATA
# Example of data below 
# u'GoogleMaps/RochesterNY'
# u'Mozilla/5.0 (Windows NT 5.1; rv:10.0.2) Gecko/20100101 Firefox/10.0.2'
# u'Mozilla/5.0 (Linux; U; Android 2.2.2; en-us; LG-P925/V10e Build/FRG83G) AppleWebKit/533.1 (KHTML,
# nan
# frame.a --> accesses key a of dataframe
# .dropna() --> ignores values where nan
# .split() --> results in a list of the values split by whitespaces
results = Series([x.split()[0] for x in frame.a.dropna()])
# print results[:10]
print results.value_counts()[:8]
Example #34
0
countingsort(randint(0,100,100))


# # Code segments

# In[457]:

arr = [2,5,3,0,2,3,0,3]
arr


# In[458]:

hist = Series(arr)
hist
sorted_hist = hist.value_counts().sort_index()


# In[459]:

sorted_hist


# In[460]:

cleaned_hist = Series(sorted_hist, index=range(max(sorted_hist.index)+1))
cleaned_hist


# In[461]:
Example #35
0
def is_categorical_column(
    data: pd.Series,
    valid_data: pd.Series,
    threshold: int = None,
    ratio: Optional[float] = None,
    oov_ratio_threshold: Optional[float] = None,
    is_label: bool = False,
) -> bool:
    """
    Identify whether a column is a categorical column.
    If the number of unique elements in the column is smaller than

        min(#Total Sample * ratio, threshold),

    it will be treated as a categorical column.

    Parameters
    ----------
    data
        One column of a multimodal pd.DataFrame for training.
    valid_data
        One column of a multimodal pd.DataFrame for validation.
    threshold
        The threshold for detecting a categorical column.
    ratio
        The ratio for detecting a categorical column.
    oov_ratio_threshold
        The out-of-vocabulary ratio between training and validation.
        This is used to determine if the column is a categorical column.
        Usually, a categorical column can tolerate a small OOV ratio.
    is_label
        Whether the column is a label column.

    Returns
    -------
    Whether the column is a categorical column.
    """
    if data.dtype.name == "category":
        return True
    else:
        if threshold is None:
            if is_label:
                threshold = 100
                oov_ratio_threshold = 0
                ratio = 0.1
            else:
                threshold = 20
                oov_ratio_threshold = 0
                ratio = 0.1
        threshold = min(int(len(data) * ratio), threshold)
        data_value_counts = data.value_counts(dropna=False)
        key_set = set(data_value_counts.keys())
        if len(data_value_counts) < threshold:
            valid_value_counts = valid_data.value_counts(dropna=False)
            total_valid_num = len(valid_data)
            oov_num = 0
            for k, v in zip(valid_value_counts.keys(),
                            valid_value_counts.values):
                if k not in key_set:
                    oov_num += v
            if is_label and oov_num != 0:
                return False
            if oov_num / total_valid_num > oov_ratio_threshold:
                return False
            return True
        return False
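As a rough illustration of the rule in the docstring above (toy data, not from the original project): with the default non-label settings (threshold=20, ratio=0.1, oov_ratio_threshold=0), a 1000-row column with 4 distinct values and a validation column with no out-of-vocabulary values is classified as categorical, since min(int(1000 * 0.1), 20) = 20 and 4 < 20:

import pandas as pd

train_col = pd.Series(['a', 'b', 'c', 'd'] * 250)   # 1000 rows, 4 unique values
valid_col = pd.Series(['a', 'b', 'c'] * 10)         # every value also appears in training

# 4 unique values < 20 and the OOV ratio is 0, so this should print True.
print(is_categorical_column(train_col, valid_col))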
Example #36
0
def countingsort(arr):
    
    # Convert to a pandas Series to build the histogram
    hist = Series(arr)
    sns.distplot(hist,kde=False,rug=True,color='royalblue', bins=max(hist.index)*3, label=r'Häufigkeit')
    plt.ylim(0, max(hist.index)+1)
    plt.xlabel('Element')
    plt.ylabel(r'Häufigkeit')
    plt.title(r'$\mathrm{Array\ Histogram}$')
    plt.show()
    
    
    
    # Sorted histogram; missing values (NaN) are replaced by 0
    sorted_hist = hist.value_counts().sort_index()
    cleaned_hist = Series(sorted_hist, index=range(max(sorted_hist.index)+1)).fillna(0)

    
    # Cumulative sum of the histogram values
    summed_hist = Series(cleaned_hist.cumsum()[:-1].values, index=range(1, max(arr)+1))
    
    # Clean up the summed histogram
    summed_hist_cleaned = Series(summed_hist, index=range(max(arr)+1)).fillna(0)
    
    
    
    
    # Create DataFrames for A, B and the helper array C
    
    #DataFrame A
    rows = len(arr)            # number of rows
    columns_A = []             # number of columns
    
    # Name the columns of A
    for num in range(rows):
        columns_A.append('A[' + str(num) + ']')
        
    # Create the DataFrame
    dframe_A = DataFrame(np.array(list(arr)*rows).reshape(rows, rows), columns=columns_A, index=range(rows))
    
    
    # Now the same for the helper array C
    hilfs_array = np.array(summed_hist_cleaned.values)
    columns = len(hilfs_array)
    columns_C = []
    for num in range(columns):
        columns_C.append('C[' + str(num) + ']')
    dframe_C = DataFrame(np.array(list(hilfs_array)*rows).reshape(rows ,columns), index=range(rows), columns=columns_C)
    
    
    # First prepare an EMPTY DataFrame B
    columns_B = []
    for num in range(rows):
        columns_B.append('B[' + str(num) + ']')    

    dframe_B = DataFrame(np.nan, index=range(rows),columns=columns_B).fillna(' ')
    
    
    # Create a dict holding the keys and values for the eventually sorted array B
    b = {}
    lookup_value = 0
    for i in range(rows):
        # Increment the values in C once they have been looked up in A
        if i > 0:
            dframe_C['C['+str(lookup_value)+']'][i:] += 1
        
        # Value that is looked up in C and should be inserted into B at position C[A[i]]
        lookup_value = dframe_A.values[i][i]
        key = 'B[' + str(int(dframe_C.values[i][lookup_value])) + ']'
        b[key] = [lookup_value,i]
    
    
    # Finally insert the values into B in sorted order
    for key, value in b.items():
        dframe_B[key][value[1]:] = value[0]    
    
    # Concatenate the 3 DataFrames
    final_dframe = pd.concat([dframe_A, dframe_C, dframe_B], axis=1)
    
    
    # The sorted array B
    result = []
    for i in range(len(b)):
        result.append(b['B[' + str(i) + ']'][0])

    #print('\nDataFrame A\n')
    #display(dframe_A)
    #print('\nDataFrame C\n')
    #display(dframe_C)
    #print('\nDataFrame B\n')
    #display(dframe_B)
    
    #display(final_dframe)
    
    return result
Example #37
0
    def test_value_counts_bins(self, klass):
        s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
        s = klass(s_values)

        # bins
        with pytest.raises(TypeError):
            s.value_counts(bins=1)

        s1 = Series([1, 1, 2, 3])
        res1 = s1.value_counts(bins=1)
        exp1 = Series({Interval(0.997, 3.0): 4})
        tm.assert_series_equal(res1, exp1)
        res1n = s1.value_counts(bins=1, normalize=True)
        exp1n = Series({Interval(0.997, 3.0): 1.0})
        tm.assert_series_equal(res1n, exp1n)

        if isinstance(s1, Index):
            tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
        else:
            exp = np.array([1, 2, 3], dtype=np.int64)
            tm.assert_numpy_array_equal(s1.unique(), exp)

        assert s1.nunique() == 3

        # these return the same
        res4 = s1.value_counts(bins=4, dropna=True)
        intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
        exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
        tm.assert_series_equal(res4, exp4)

        res4 = s1.value_counts(bins=4, dropna=False)
        intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
        exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
        tm.assert_series_equal(res4, exp4)

        res4n = s1.value_counts(bins=4, normalize=True)
        exp4n = Series([0.5, 0.25, 0.25, 0],
                       index=intervals.take([0, 3, 1, 2]))
        tm.assert_series_equal(res4n, exp4n)

        # handle NA's properly
        s_values = [
            'a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b'
        ]
        s = klass(s_values)
        expected = Series([4, 3, 2], index=['b', 'a', 'd'])
        tm.assert_series_equal(s.value_counts(), expected)

        if isinstance(s, Index):
            exp = Index(['a', 'b', np.nan, 'd'])
            tm.assert_index_equal(s.unique(), exp)
        else:
            exp = np.array(['a', 'b', np.nan, 'd'], dtype=object)
            tm.assert_numpy_array_equal(s.unique(), exp)
        assert s.nunique() == 3

        s = klass({})
        expected = Series([], dtype=np.int64)
        tm.assert_series_equal(s.value_counts(),
                               expected,
                               check_index_type=False)
        # returned dtype differs depending on original
        if isinstance(s, Index):
            tm.assert_index_equal(s.unique(), Index([]), exact=False)
        else:
            tm.assert_numpy_array_equal(s.unique(),
                                        np.array([]),
                                        check_dtype=False)

        assert s.nunique() == 0
Example #38
0
# print cy_counts[:20]
# print l_counts[:20]

clean_tz = frame['tz'].fillna('Missing')

clean_tz[clean_tz == ''] = 'TZ Unknown'

tz_counts = clean_tz.value_counts()

print tz_counts[:10]

tz_counts[:10].plot(kind='barh', rot=0)

results = Series([x.split()[0] for x in frame.a.dropna()])

print results.value_counts()[:12]

cframe = frame[frame.a.notnull()]
operating_system = np.where(cframe['a'].str.contains('Windows'), 'Windows', 'Not Windows')

print operating_system[:10]

by_tz_os = cframe.groupby(['tz', operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)

print agg_counts[:10]

indexer = agg_counts.sum(1).argsort()

print agg_counts[:20]
Example #39
0
    def test_value_counts_inferred(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)
            expected = Series([4, 3, 2, 1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(s.value_counts(), expected)
            
            self.assert_numpy_array_equal(s.unique(), np.unique(s_values))
            self.assertEquals(s.nunique(), 4)
            # don't sort, have to sort after the fact as not sorting is platform-dep
            hist = s.value_counts(sort=False)
            hist.sort()
            expected = Series([3, 1, 4, 2], index=list('acbd'))
            expected.sort()
            tm.assert_series_equal(hist, expected)

            # sort ascending
            hist = s.value_counts(ascending=True)
            expected = Series([1, 2, 3, 4], index=list('cdab'))
            tm.assert_series_equal(hist, expected)

            # relative histogram.
            hist = s.value_counts(normalize=True)
            expected = Series([.4, .3, .2, .1], index=['b', 'a', 'd', 'c'])
            tm.assert_series_equal(hist, expected)

            # bins
            self.assertRaises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({0.998: 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({0.998: 1.0})
            tm.assert_series_equal(res1n, exp1n)

            self.assert_numpy_array_equal(s1.unique(), np.array([1, 2, 3]))
            self.assertEquals(s1.nunique(), 3)

            res4 = s1.value_counts(bins=4)
            exp4 = Series({0.998: 2, 1.5: 1, 2.0: 0, 2.5: 1}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4, exp4)
            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series({0.998: 0.5, 1.5: 0.25, 2.0: 0.0, 2.5: 0.25}, index=[0.998, 2.5, 1.5, 2.0])
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan, 'd', 'd', 'a', 'a', 'b']
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            self.assert_numpy_array_equal(s.unique(), np.array(['a', 'b', np.nan, 'd'], dtype='O'))
            self.assertEquals(s.nunique(), 3)

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected)
            self.assert_numpy_array_equal(s.unique(), np.array([]))
            self.assertEquals(s.nunique(), 0)

            # GH 3002, datetime64[ns]
            txt = "\n".join(['xxyyzz20100101PIE', 'xxyyzz20100101GUM', 'xxyyzz20100101EGG',
                             'xxyyww20090101EGG', 'foofoo20080909PIE', 'foofoo20080909GUM'])
            f = StringIO(txt)
            df = pd.read_fwf(f, widths=[6, 8, 3], names=["person_id", "dt", "food"],
                             parse_dates=["dt"])

            s = klass(df['dt'].copy())

            idx = pd.to_datetime(['2010-01-01 00:00:00Z', '2008-09-09 00:00:00Z', '2009-01-01 00:00:00X'])
            expected_s = Series([3, 2, 1], index=idx)
            tm.assert_series_equal(s.value_counts(), expected_s)

            expected = np.array(['2010-01-01 00:00:00Z', '2009-01-01 00:00:00Z', '2008-09-09 00:00:00Z'],
                                dtype='datetime64[ns]')
            if isinstance(s, DatetimeIndex):
                expected = DatetimeIndex(expected)
                self.assert_(s.unique().equals(expected))
            else:
                self.assert_numpy_array_equal(s.unique(), expected)

            self.assertEquals(s.nunique(), 3)

            # with NaT
            s = df['dt'].copy()
            s = klass([v for v in s.values] + [pd.NaT])

            result = s.value_counts()
            self.assertEqual(result.index.dtype, 'datetime64[ns]')
            expected_s[pd.NaT] = 1
            tm.assert_series_equal(result, expected_s)

            unique = s.unique()
            self.assertEqual(unique.dtype, 'datetime64[ns]')
            # numpy_array_equal cannot compare pd.NaT
            self.assert_numpy_array_equal(unique[:3], expected)
            self.assertTrue(unique[3] is pd.NaT or unique[3].astype('int64') == pd.tslib.iNaT)

            self.assertEquals(s.nunique(), 4)

            # timedelta64[ns]
            td = df.dt - df.dt + timedelta(1)
            td = klass(td)

            result = td.value_counts()
            expected_s = Series([6], index=[86400000000000])
            self.assertEqual(result.index.dtype, 'int64')
            tm.assert_series_equal(result, expected_s)

            # get nanoseconds to compare
            expected = np.array([86400000000000])
            self.assert_numpy_array_equal(td.unique(), expected)
            self.assertEquals(td.nunique(), 1)

            td2 = timedelta(1) + (df.dt - df.dt)
            td2 = klass(td2)
            result2 = td2.value_counts()

            self.assertEqual(result2.index.dtype, 'int64')
            tm.assert_series_equal(result2, expected_s)

            self.assert_numpy_array_equal(td.unique(), expected)
            self.assertEquals(td.nunique(), 1)
Example #40
0
from pandas import DataFrame, Series
import pandas as pd; import numpy as np
frame = DataFrame(records)
frame

# Blank / missing entries: replace them with the string 'Missing' using the fillna method
# Entries that exist but are empty strings: replace them with 'Unknown' using Boolean-array indexing
clean_tz = frame['tz'].fillna('Missing')
clean_tz[clean_tz == ''] = 'Unknown'
tz_counts = clean_tz.value_counts()
tz_counts[:10]

# Extract and display the first token of each user agent
results = Series([x.split()[0] for x in frame.a.dropna()])
results[:5]
results.value_counts()[:8] # show the top 8 counts

# Classify Windows users vs. non-Windows users
# Classification rule for a Windows user: the UA contains the string 'Windows'
# Starting from the frame object, exclude records that have no UA
cframe = frame[frame.a.notnull()]
# use numpy.where()
operating_system = np.where(cframe['a'].str.contains('Windows'),'Windows','Not Windows')
operating_system[:5]

# Group by each combination of time zone and operating system
# use pandas.DataFrame.unstack()
by_tz_os = cframe.groupby(['tz', operating_system])
agg_counts = by_tz_os.size().unstack().fillna(0)

# use an ascending sort
Example #41
0
    def test_value_counts_bins(self):
        klasses = [Index, Series]
        for klass in klasses:
            s_values = ['a', 'b', 'b', 'b', 'b', 'c', 'd', 'd', 'a', 'a']
            s = klass(s_values)

            # bins
            pytest.raises(TypeError, lambda bins: s.value_counts(bins=bins), 1)

            s1 = Series([1, 1, 2, 3])
            res1 = s1.value_counts(bins=1)
            exp1 = Series({Interval(0.997, 3.0): 4})
            tm.assert_series_equal(res1, exp1)
            res1n = s1.value_counts(bins=1, normalize=True)
            exp1n = Series({Interval(0.997, 3.0): 1.0})
            tm.assert_series_equal(res1n, exp1n)

            if isinstance(s1, Index):
                tm.assert_index_equal(s1.unique(), Index([1, 2, 3]))
            else:
                exp = np.array([1, 2, 3], dtype=np.int64)
                tm.assert_numpy_array_equal(s1.unique(), exp)

            assert s1.nunique() == 3

            # these return the same
            res4 = s1.value_counts(bins=4, dropna=True)
            intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
            exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
            tm.assert_series_equal(res4, exp4)

            res4 = s1.value_counts(bins=4, dropna=False)
            intervals = IntervalIndex.from_breaks([0.997, 1.5, 2.0, 2.5, 3.0])
            exp4 = Series([2, 1, 1, 0], index=intervals.take([0, 3, 1, 2]))
            tm.assert_series_equal(res4, exp4)

            res4n = s1.value_counts(bins=4, normalize=True)
            exp4n = Series([0.5, 0.25, 0.25, 0],
                           index=intervals.take([0, 3, 1, 2]))
            tm.assert_series_equal(res4n, exp4n)

            # handle NA's properly
            s_values = ['a', 'b', 'b', 'b', np.nan, np.nan,
                        'd', 'd', 'a', 'a', 'b']
            s = klass(s_values)
            expected = Series([4, 3, 2], index=['b', 'a', 'd'])
            tm.assert_series_equal(s.value_counts(), expected)

            if isinstance(s, Index):
                exp = Index(['a', 'b', np.nan, 'd'])
                tm.assert_index_equal(s.unique(), exp)
            else:
                exp = np.array(['a', 'b', np.nan, 'd'], dtype=object)
                tm.assert_numpy_array_equal(s.unique(), exp)
            assert s.nunique() == 3

            s = klass({})
            expected = Series([], dtype=np.int64)
            tm.assert_series_equal(s.value_counts(), expected,
                                   check_index_type=False)
            # returned dtype differs depending on original
            if isinstance(s, Index):
                tm.assert_index_equal(s.unique(), Index([]), exact=False)
            else:
                tm.assert_numpy_array_equal(s.unique(), np.array([]),
                                            check_dtype=False)

            assert s.nunique() == 0
Example #42
0
iris.target


# In[178]:

len(iris.target)


# In[179]:

Y=Series(iris.target)


# In[180]:

Y.value_counts()


##### Clearly there are 3 groups of dependent variable values. Now we will try to depict these values graphically

# In[181]:

iris.data.shape


##### For each value of the dependent variable in Y, we have 4 independent variables - we will use 2 of them and plot the points

# In[182]:

X=DataFrame(iris.data[:,0:2])
Example #43
0
    # TODO: Use the interquartile range to calculate an outlier step (1.5 times the interquartile range)
    step = 1.5*(Q3 - Q1)
    print "step = %1.2f" % step
    
    # Display the outliers
    print "Data points considered outliers for the feature '{}':".format(feature)
    df_outlier = log_data[~((log_data[feature] >= Q1 - step) & (log_data[feature] <= Q3 + step))]
    #display(df_outlier) # switching this off for better graphical output
    print "Switching this off for better graphical output\n"

    # OPTIONAL: Select the indices for data points you wish to remove
    for i in df_outlier.index:
        out_liers.append(i)

s = Series(out_liers) # convert outliers into a Series object
s_vc = s.value_counts() # use value_counts method to group by same outlier index
valid = [i for i in range(log_data.shape[0]) if not(i in s_vc[s_vc>1])] # keep all indices that have at most 1 outlier feature

# Remove the outliers, if any were specified
# good_data = log_data.drop(log_data.index[outliers]).reset_index(drop = True)
good_data = log_data.ix[valid,:]

print "Potential outliers = ", len(s_vc)
print "'True' outliers"
print s_vc[s_vc>1]

print "Original data = ", log_data.shape[0]
print "Data without outliers = ", good_data.shape[0]


Example #44
0
    'The Man with the Twisted Lip', 'The Adventure of the Blue Carbuncle',
    'The Adventure of the Speckled Band',
    "The Adventure of the Engineer's Thumb",
    'The Adventure of the Noble Bachelor',
    'The Adventure of the Beryl Coronet', 'The Adventure of the Copper Beeches'
]

titles = [title.lower() for title in titles]

shortStoryCounts = []
for i, title in enumerate(titles):
    shortStory = sherlockTexts[title]
    tokenizedStory = nltk.word_tokenize(shortStory)
    tokenizedStory = [word for word in tokenizedStory if word.isalnum()]
    tokenSeries = Series(tokenizedStory)
    shortStoryCounts.append(tokenSeries.value_counts())

df = pd.concat(shortStoryCounts, axis=1, sort=False)
dtm = df.T

# First let's get the term frequencies. These are just the raw term counts in the dtm
# divided by the length of the document.
documentLengths = dtm.sum(axis=1)  # Add up the word count for all the words!
frequencyDtm = dtm.div(documentLengths, axis='index')
# Replace NaN values with 0 (otherwise the math won't work)
frequencyDtm = frequencyDtm.fillna(0)

# Get a Series which tells you how many Documents have the term
docsWithTerm = dtm.count()

# Get the weight of the term (total number of documents divided by number of