Example #1
0
    def test_fillna_categorical_nan(self):
        """Check that NaN / NaT are accepted as fill values for categoricals.

        Regression test for GH 14021.
        """
        # np.nan should always be a valid filler
        frame = DataFrame({
            "cats": Categorical([np.nan, 2, np.nan]),
            "vals": Categorical([np.nan, np.nan, np.nan]),
        })
        filled = frame.fillna(frame.median())
        expected = DataFrame({"cats": [2, 2, 2],
                              "vals": [np.nan, np.nan, np.nan]},
                             dtype='category')
        tm.assert_frame_equal(filled, expected)

        # filling a single categorical column with NaN leaves it unchanged
        for column in ("cats", "vals"):
            tm.assert_series_equal(frame[column].fillna(np.nan),
                                   frame[column])

        # the same must hold for NaT with datetime-like categories; each
        # index is built lazily, in order, right before its own assertion
        nat_cases = [
            (pd.DatetimeIndex,
             ['2011-01-01 09:00', '2016-01-01 23:45',
              '2011-01-01 09:00', pd.NaT, pd.NaT],
             {}),
            (pd.PeriodIndex,
             ['2011-01', '2011-01', '2011-01', pd.NaT, pd.NaT],
             {'freq': 'M'}),
            (pd.TimedeltaIndex,
             ['1 days', '2 days', '1 days', pd.NaT, pd.NaT],
             {}),
        ]
        for make_index, values, options in nat_cases:
            frame = DataFrame({'a': Categorical(make_index(values,
                                                           **options))})
            tm.assert_frame_equal(frame.fillna(value=pd.NaT), frame)
class LogAggregate:
    """Compute summary statistics over a tabular log dataset.

    The data handed to the constructor is wrapped in a ``DataFrame``.
    Every ``get_*`` method accepts the same keyword arguments:

    key
        Label selected from the aggregated result (required; a missing
        ``key`` raises ``KeyError``).
    group_by
        Optional argument forwarded to ``DataFrame.groupby``. When given,
        the statistic is computed per group instead of over the whole
        dataset.

    Positional arguments are accepted for backward compatibility and
    ignored.
    """

    def __init__(self, dataset):
        self.dataset = DataFrame(dataset)

    def _aggregate(self, stat, options):
        """Call the aggregation method named *stat* and select the key.

        *stat* is looked up on the grouped frame when ``options`` contains
        ``'group_by'`` and on the plain frame otherwise.
        """
        # ``in`` replaces dict.has_key(), which was removed in Python 3.
        if 'group_by' in options:
            source = self.dataset.groupby(options['group_by'])
        else:
            source = self.dataset
        return getattr(source, stat)()[options['key']]

    def get_median(self, *arg, **kwarg):
        """Return the median of ``key``, optionally per ``group_by`` group."""
        return self._aggregate('median', kwarg)

    def get_average(self, *arg, **kwarg):
        """Return the mean of ``key``, optionally per ``group_by`` group."""
        return self._aggregate('mean', kwarg)

    def get_min(self, *arg, **kwarg):
        """Return the minimum of ``key``, optionally per ``group_by`` group."""
        return self._aggregate('min', kwarg)

    def get_max(self, *arg, **kwarg):
        """Return the maximum of ``key``, optionally per ``group_by`` group."""
        return self._aggregate('max', kwarg)

    def get_count(self, *arg, **kwarg):
        """Return the non-null count of ``key``, optionally per group."""
        return self._aggregate('count', kwarg)
Example #3
0
    def test_quantile(self, datetime_frame):
        """Check DataFrame.quantile on both axes, empty and mixed input."""
        frame = datetime_frame

        # along the index: one value per column, matching numpy's percentile
        col_q = frame.quantile(0.1, axis=0)
        assert col_q['A'] == np.percentile(frame['A'], 10)
        tm.assert_index_equal(col_q.index, frame.columns)

        # along the columns: one value per row
        row_label = '2000-01-17'
        row_q = frame.quantile(0.9, axis=1)
        assert row_q[row_label] == np.percentile(frame.loc[row_label], 90)
        tm.assert_index_equal(row_q.index, frame.index)

        # test degenerate case
        empty_q = DataFrame({'x': [], 'y': []}).quantile(0.1, axis=0)
        assert np.isnan(empty_q['x'])
        assert np.isnan(empty_q['y'])

        # non-numeric exclusion
        mixed = DataFrame({'col1': ['A', 'A', 'B', 'B'],
                           'col2': [1, 2, 3, 4]})
        assert_series_equal(mixed.quantile(0.5), mixed.median().rename(0.5))

        # axis
        numeric = DataFrame({"A": [1, 2, 3], "B": [2, 3, 4]},
                            index=[1, 2, 3])
        assert_series_equal(
            numeric.quantile(.5, axis=1),
            Series([1.5, 2.5, 3.5], index=[1, 2, 3], name=0.5))

        multi_q = numeric.quantile([.5, .75], axis=1)
        multi_expected = DataFrame(
            {1: [1.5, 1.75], 2: [2.5, 2.75], 3: [3.5, 3.75]},
            index=[0.5, 0.75])
        assert_frame_equal(multi_q, multi_expected, check_index_type=True)

        # We may want to break API in the future to change this
        # so that we exclude non-numeric along the same axis
        # See GH #7312
        object_rows = DataFrame([[1, 2, 3],
                                 ['a', 'b', 4]])
        assert_series_equal(
            object_rows.quantile(.5, axis=1),
            Series([3., 4.], index=[0, 1], name=0.5))
def cross_validate_trades(trades, N = 20, subset_fraction = 0.7):
    """Summarise how stable ``summary_report`` is across random ticker subsets.

    For each of ``N`` runs a random subset of ``trades.tickers`` is drawn
    without replacement, the trades for those tickers are selected with
    ``trades.find`` and ``summary_report`` is evaluated on that subset.

    Parameters
    ----------
    trades
        Object exposing ``tickers`` and ``find(predicate)``; the predicate
        receives items that have a ``ticker`` attribute.
    N
        Number of random subsets to evaluate.
    subset_fraction
        Fraction of the tickers drawn for each subset.

    Returns
    -------
    tuple
        ``(result, summary)``. ``summary`` holds one column per run;
        ``result`` holds the report for the full ``trades`` ('Base') next
        to the row-wise 'Mean', 'Std', 'Median', 'Max' and 'Min' of
        ``summary``.
    """
    tickers = trades.tickers
    # round() returns a float on Python 2; the sample size must be an int.
    sample_size = int(round(len(tickers) * subset_fraction))
    summary = DataFrame(dtype = float)

    for n in range(N):
        # NOTE(review): `random` is presumably numpy.random (the call uses
        # `replace=`) -- confirm against the file's imports.
        # A set keeps the per-trade membership test below constant-time.
        sample_tickers = set(random.choice(tickers, sample_size, replace = False))
        trade_subset = trades.find(lambda T: T.ticker in sample_tickers)
        summary[n] = summary_report(trade_subset)

    result = DataFrame(dtype = float)
    result['Base'] = summary_report(trades)
    # Each statistic is taken across the runs, i.e. along the columns.
    result['Mean'] = summary.mean(axis = 1)
    result['Std'] = summary.std(axis = 1)
    result['Median'] = summary.median(axis = 1)
    result['Max'] = summary.max(axis = 1)
    result['Min'] = summary.min(axis = 1)

    return (result, summary)
Example #5
0
import numpy as np
import scipy as sc
from scipy import stats
import matplotlib.pyplot as plt
import pandas as pd
from pandas import DataFrame, Series
# Load the CSV and keep a frame restricted to the two columns used below.
db = pd.read_csv("/users/rosiezou/Desktop/mortgage-stanley/FMAC-5US.csv")
table = DataFrame(db, columns=['Date', 'Value'])

# Raw observations as blue dots.
plt.plot(db['Date'], db['Value'], 'bo')

# Linear regression of Value on Date; the first two entries of the result
# are the slope and the intercept.
regressionline = stats.linregress(db['Date'], db['Value'])
m, b = regressionline[0], regressionline[1]

# Draw the fitted line over 100 evenly spaced points in [0, 18].
x = np.linspace(0, 18, 100)
plt.plot(x, m * x + b)
plt.show()

# Column-wise median and mode of the selected columns.
print(table.median(axis=0))
print(table.mode(axis=0))
Example #6
0
# the data has been prepared; now plot it:

for new_counter in range(file_counter+1):
    #print new_counter
    Qbers = final_data[(final_data["Dataset"]==new_counter) & (final_data["Qber"] > 0) ]
    x1 = Qbers.index.tolist()
    y1 = Qbers["Qber"].tolist()
    x1_average = DataFrame.mean(Qbers)["Qber"]
    x1_std_dev = DataFrame.std(Qbers)["Qber"]
    #preparing proper time:
    x1[:] = [x - quelle_initialTimestamps[new_counter] for x in x1]
    
    Raws = final_data[(final_data["Dataset"]==new_counter) & (final_data["Raw key"] > 0) ]
    x2_average = DataFrame.mean(Raws)["Raw key"]
    x2_median = DataFrame.median(Raws)["Raw key"]
    x2_max = DataFrame.max(Raws)["Raw key"]
    
    Raws = Raws[Raws["Raw key"]<(x2_max - (x2_max/100)*20)]
    
    x2 = Raws.index.tolist()
    y2 = Raws["Raw key"].tolist()

    print x2_average
    #x2_std_dev = 3
    #once again correcting counter:
    x2[:] = [x - quelle_initialTimestamps[new_counter] for x in x2]
    #print x1[0], x2[0], quelle_initialTimestamps[new_counter]
    # Two subplots, the axes array is 1-d http://matplotlib.org/examples/pylab_examples/subplots_demo.html
    f, axarr = plt.subplots(2, sharex=True)
    axarr[0].grid()