def deal_outlier():
    np.random.seed(12345)
    data=DataFrame(np.random.randn(1000,4))
    print data.describe()
    print data[3][np.abs(data[3])>3]
    print data[(np.abs(data)>3).any(1)]
    data[np.abs(data)>3]=np.sign(data)*3
    print data.describe()
def slide_15():
    np.random.seed(12345)
    data = DataFrame(np.random.randn(1000, 4))
    print data.describe()

    col = data[3]
    print col[np.abs(col) > 3]

    print data[(np.abs(data) > 3).any(1)]
    data[np.abs(data) > 3] = np.sign(data) * 3
    print data.describe()
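
The capping trick used in both functions is the whole technique: np.sign(data) is -1/0/+1 elementwise, so assigning np.sign(data) * 3 through the boolean mask clips every outlier to the interval [-3, 3]. A minimal self-contained sketch (the seed, shape, and threshold are arbitrary):

import numpy as np
from pandas import DataFrame

np.random.seed(0)
demo = DataFrame(np.random.randn(100, 2))
demo[np.abs(demo) > 3] = np.sign(demo) * 3  # cap values outside [-3, 3]
assert float(demo.abs().max().max()) <= 3.0
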
Example #3
def pd_04():
    obj=Series(range(4),index=['d','a','b','c'])
    print obj
    print obj.sort_index()
    frame=DataFrame(np.arange(8).reshape(2,4),index=['three','one'],columns=['d','a','b','c'])
    print frame.sort_index()
    print frame.sort_index(axis=1)
    print frame.sort_index(ascending=False)
    obj1=Series([4,7,-3,2])
    print obj1.order()
    print frame.sort_index(by='b')
    print frame.sort_index(by=['a','b'])
    print frame.describe()
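
Series.order() and DataFrame.sort_index(by=...) used above are legacy pandas APIs that were later deprecated and removed; the modern spelling is sort_values. A sketch of the equivalents:

import numpy as np
from pandas import DataFrame, Series

obj1 = Series([4, 7, -3, 2])
print(obj1.sort_values())                # replaces obj1.order()

frame = DataFrame(np.arange(8).reshape(2, 4),
                  index=['three', 'one'], columns=['d', 'a', 'b', 'c'])
print(frame.sort_values(by='b'))         # replaces frame.sort_index(by='b')
print(frame.sort_values(by=['a', 'b']))  # replaces frame.sort_index(by=['a', 'b'])
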
Example #4
    def test_describe_no_numeric(self):
        df = DataFrame({'A': ['foo', 'foo', 'bar'] * 8,
                        'B': ['a', 'b', 'c', 'd'] * 6})
        desc = df.describe()
        expected = DataFrame(dict((k, v.describe())
                                  for k, v in compat.iteritems(df)),
                             columns=df.columns)
        assert_frame_equal(desc, expected)

        ts = tm.makeTimeSeries()
        df = DataFrame({'time': ts.index})
        desc = df.describe()
        self.assertEqual(desc.time['first'], min(ts.index))
Example #5
class Describe(object):

    def setup(self):
        self.df = DataFrame({
            'a': np.random.randint(0, 100, int(1e6)),
            'b': np.random.randint(0, 100, int(1e6)),
            'c': np.random.randint(0, 100, int(1e6))
        })

    def time_series_describe(self):
        self.df['a'].describe()

    def time_dataframe_describe(self):
        self.df.describe()
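
This class follows the asv (airspeed velocity) benchmark convention used by the pandas project: setup() builds the fixtures and each time_* method is the timed body. Outside the asv harness you can exercise it by hand, e.g. (a sketch, assuming the class and its imports above are in scope):

import timeit

bench = Describe()
bench.setup()
print(timeit.timeit(bench.time_dataframe_describe, number=10))  # total seconds for 10 calls
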
Example #6
    def calcCVA(self, expectedExposure=array):  # caution: the default here is numpy's array callable itself, not an array; always pass the exposure series
        cvaData = DataFrame()
        cvaData['t'] = self.timesteps[1:]
        cvaData['discountFactor'] = self.discountFactors
        cvaData['pd'] = self.pD[1:]
        cvaData['1-R'] = [1 - self.recoveryRate] * len(self.pD[1:])
        cvaData['exposure'] = [getTPlusFromList(expectedExposure, i, True) for i in range(len(expectedExposure))]
        cvaData['cvaPerTimeStep'] = cvaData['discountFactor'] * cvaData['pd'] * cvaData['1-R'] * cvaData['exposure']
        cva = cvaData['cvaPerTimeStep'].sum()
        cvaData.describe()
        print cvaData
        print 'CVA = ', cva
        return cva
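
The column product above is the standard unilateral CVA approximation, CVA ≈ Σ_t DF_t · PD_t · (1 − R) · EE_t. A tiny numeric check with made-up inputs (every value here is illustrative only):

import numpy as np

discount = np.array([0.99, 0.97, 0.95])      # DF_t
default_p = np.array([0.010, 0.012, 0.015])  # marginal PD_t
exposure = np.array([1.2, 1.1, 0.9])         # EE_t
recovery = 0.4
cva = np.sum(discount * default_p * (1 - recovery) * exposure)
print(cva)  # ~0.0225 for these numbers
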
Example #7
def descriptiveStatsDataFrame():
    df = DataFrame([[1.4, np.nan], [7, 5], [np.nan, np.nan], [7,10]], index=['a','b','c','d'], columns=['one','two'])
    print (df)
    print ('Column Sum: \n{}'.format(df.sum(axis=0)))
    print ('Row Sum: \n{}'.format(df.sum(axis=1)))
    print ('Do not skip NA: \n{}'.format(df.sum(axis=1, skipna=False)))
    print ('Index with min Value: \n{}'.format(df.idxmin()))
    print ('Summary Statistic: \n{}'.format(df.describe()))
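
The axis argument is the usual stumbling block here: axis=0 reduces down the rows (one result per column) while axis=1 reduces across the columns (one result per row). A two-line check (sketch):

from pandas import DataFrame

demo = DataFrame([[1, 2], [3, 4]], columns=['one', 'two'])
print(demo.sum(axis=0))  # one -> 4, two -> 6
print(demo.sum(axis=1))  # row 0 -> 3, row 1 -> 7
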
Example #8
    def test_iloc_getitem_doc_issue(self):

        # multi axis slicing issue with single block
        # surfaced in GH 6059

        arr = np.random.randn(6, 4)
        index = date_range('20130101', periods=6)
        columns = list('ABCD')
        df = DataFrame(arr, index=index, columns=columns)

        # defines ref_locs
        df.describe()

        result = df.iloc[3:5, 0:2]
        str(result)
        result.dtypes

        expected = DataFrame(arr[3:5, 0:2], index=index[3:5],
                             columns=columns[0:2])
        tm.assert_frame_equal(result, expected)

        # for dups
        df.columns = list('aaaa')
        result = df.iloc[3:5, 0:2]
        str(result)
        result.dtypes

        expected = DataFrame(arr[3:5, 0:2], index=index[3:5],
                             columns=list('aa'))
        tm.assert_frame_equal(result, expected)

        # related
        arr = np.random.randn(6, 4)
        index = list(range(0, 12, 2))
        columns = list(range(0, 8, 2))
        df = DataFrame(arr, index=index, columns=columns)

        df._data.blocks[0].mgr_locs
        result = df.iloc[1:5, 2:4]
        str(result)
        result.dtypes
        expected = DataFrame(arr[1:5, 2:4], index=index[1:5],
                             columns=columns[2:4])
        tm.assert_frame_equal(result, expected)
Example #9
def combine_models():

    model_nums = [1, 2, 3]
    # model_nums = [2, 3]
    model_paths = ['model%03d.y_test.csv' % i for i in model_nums]
    assert all(os.path.exists(path) for path in model_paths)
    # y1 = pd.read_csv('model001.y_test.csv').set_index('job_id')
    # y2 = pd.read_csv('model002.y_test.csv').set_index('job_id')
    # y3 = pd.read_csv('model003.y_test.csv').set_index('job_id')

    models = [pd.read_csv(path).set_index('job_id') for path in model_paths]

    path = 'all/jobs_all.csv'
    df = get_data(path)
    df_train, df_test = split_train_test(df)
    y_data = np.ones((len(df_test), len(models)), dtype=int) * -1
    y = DataFrame(y_data, columns=model_nums, index=df_test.index)

    for d in [y] + models:
        print(d.describe())
    for d in [y] + models:
        print(d.shape, len(y) - len(d), type(d))

    y_indexes = set(y.index)
    print('y_indexes: %s' % sorted(y_indexes)[:10])

    for c, d in zip(model_nums, models):
        d_indexes = set(d.index)
        print('c=%s, d_indexes: %s' % (c, sorted(d_indexes)[:10]))
        assert d_indexes.issubset(y_indexes), (len(d_indexes - y_indexes))
        y[c].loc[d.index] = d['hat']

    def func(row):
        return all(x == -1 for x in row)

    # empties = y.apply(func, axis=1)
    # print('empties: %d' % len(empties))
    # print(y[empties])

    def vote(row):
        print(row)
        for j in 1, 3, 2:
            if row[j] != -1:
                return row[j]
        # assert False, row
        return -3

    print(y.iloc[:20, :])
    y_series = y.iloc[:20, :].apply(vote, axis=1)
    # assert False  # debugging stop; disable to let the export below run
    y_test = DataFrame(y_series, columns=['hat'], index=y.index)
    y_test.to_csv('%s.y_test.csv' % 'model004v', index_label='job_id')
    print(y_test.columns)
    print(y_test.describe())
    print(y_test.iloc[:10, :])
Example #10
    def test_describe_objects(self):
        df = DataFrame({"C1": ['a', 'a', 'c'], "C2": ['d', 'd', 'f']})
        result = df.describe()
        expected = DataFrame({"C1": [3, 2, 'a', 2], "C2": [3, 2, 'd', 2]},
                             index=['count', 'unique', 'top', 'freq'])
        assert_frame_equal(result, expected)

        df = DataFrame({"C1": pd.date_range('2010-01-01', periods=4, freq='D')})
        df.loc[4] = pd.Timestamp('2010-01-04')
        result = df.describe()
        expected = DataFrame({"C1": [5, 4, pd.Timestamp('2010-01-01'),
                                     pd.Timestamp('2010-01-04'),
                                     pd.Timestamp('2010-01-04'), 2]},
                             index=['count', 'unique', 'first', 'last', 'top',
                                    'freq'])
        assert_frame_equal(result, expected)

        # mix time and str
        df['C2'] = ['a', 'a', 'b', 'c', 'a']
        result = df.describe()
        # when a mix of datetime / obj, the index gets reordered.
        expected['C2'] = [5, 3, np.nan, np.nan, 'a', 3]
        assert_frame_equal(result, expected)

        # just str
        expected = DataFrame({'C2': [5, 3, 'a', 3]},
                             index=['count', 'unique', 'top', 'freq'])
        result = df[['C2']].describe()
        assert_frame_equal(result, expected)

        # mix of time, str, numeric
        df['C3'] = [2, 4, 6, 8, 2]
        result = df.describe()
        expected = DataFrame({"C3": [5., 4.4, 2.607681, 2., 2., 4., 6., 8.]},
                             index=['count', 'mean', 'std', 'min', '25%',
                                    '50%', '75%', 'max'])
        assert_frame_equal(result, expected)
        assert_frame_equal(df.describe(), df[['C3']].describe())

        assert_frame_equal(df[['C1', 'C3']].describe(), df[['C3']].describe())
        assert_frame_equal(df[['C2', 'C3']].describe(), df[['C3']].describe())
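
The tail of this test documents an old quirk: with mixed dtypes, describe() silently narrows to the numeric columns. Newer pandas versions accept an include parameter to summarize everything explicitly (a sketch; the exact row set varies by version):

from pandas import DataFrame

mixed = DataFrame({'num': [1, 2, 3], 'cat': ['x', 'x', 'y']})
print(mixed.describe())               # numeric column only
print(mixed.describe(include='all'))  # all columns, NaN where a stat does not apply
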
Example #11
def messages_data(soup,message_csv):
    messages = scrape_element(soup, 'messages', '.Message')
    msg_lengths = []
    pd.set_option('display.max_colwidth', -1)
    for k, v in messages.items():
        msg_lengths.append(len(v))
        text = Series(str(np.array(v.encode('utf-8'))))
        print text
        text.to_csv(message_csv, sep=',', header=False, index=False, mode='a')
    df_msg_lgth = DataFrame(msg_lengths)
    df_msg_describe = DataFrame(df_msg_lgth.describe()).T
    cols = df_msg_describe.columns
    df_msg_describe.columns = ['msg_' + c for c in cols]
    return df_msg_describe
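
The manual prefix loop at the end can be collapsed with DataFrame.add_prefix (a sketch with dummy message lengths):

from pandas import DataFrame

df_msg_lgth = DataFrame([12, 40, 7])
df_msg_describe = df_msg_lgth.describe().T.add_prefix('msg_')
print(list(df_msg_describe.columns))  # ['msg_count', 'msg_mean', 'msg_std', ...]
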
Example #12
def edbSave():
    'Fetch the edb codes from the user clipboard and pull the indicator data through the API'

    # Get the codes from the clipboard plus the start and end dates entered
    codes = getCodeFromClipboard()
    start = sDate()
    end = eDate()

    data = w.edb(codes, start, end, "Fill=Previous")
    datachg = [d.strftime('%y-%m-%d') for d in data.Times]
    df = DataFrame(data.Data, index=data.Codes, columns=datachg).T
    print('-' * 85)
    print(df)
    print('-' * 85)
    print('Summary statistics:')
    print(df.describe())
    print("sum", " " * 3, str(df.sum()).split(sep="    ")[1].rjust(10))
    return df
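
Splitting the printed repr of df.sum() on runs of spaces is brittle; iterating the Series directly avoids depending on display formatting (a sketch with a dummy frame):

import pandas as pd

totals = pd.DataFrame({'a': [1.0, 2.0], 'b': [3.0, 4.0]}).sum()
for code, total in totals.items():
    print('sum', code, '{:>10.2f}'.format(total))
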
Example #13
def StatisticDescribeDatas():
    """
    Run descriptive statistics on the data
    :return:
    """
    from matplotlib import pyplot as plt
    from pandas import DataFrame

    laborary_operation = ArchlaboraryOperation()
    laborary_operation.prepareForProjectDataTable()
    res, _ = laborary_operation.getResponse()
    if res:
        _res_data = laborary_operation.getData()
        columns = []
        datas = []
        for _h in _res_data['properties']:
            columns.append(_h.label)
        for _data_item in _res_data['records']:
            _tmp_array = []
            for _pi in _res_data['properties']:
                if _res_data['records'][_data_item][_pi.label]:
                    _val = _res_data['records'][_data_item][_pi.label].loadValueLabel()
                    _tmp_array.append(float(_val) if _val.isdigit() else _val)
            datas.append(_tmp_array)
        df = DataFrame(datas, columns=columns)
        fig = plt.figure()
        #ax = fig.add_subplot(1, 1, 1)
        ax = fig.add_axes([0.1, 0.1, 0.8, 0.8])
        _d = df.describe()
        print("echo describe ......")
        print(_d)
        ax.plot(_d, 'r--')
        for seq in range(0, len(_d)):
            ax.text(seq, _d.values[seq][0], "%s(%s)" % (str(_d.index[seq]), str(_d.values[seq][0])))
        buf = StringIO()
        plt.savefig(buf, dpi=50, format="png")
        response = make_response(buf.getvalue())
        response.headers['Content-Type'] = 'image/png'
        return response
Example #14
    def test_column_dups_operations(self):

        def check(result, expected=None):
            if expected is not None:
                assert_frame_equal(result, expected)
            result.dtypes
            str(result)

        # assignment
        # GH 3687
        arr = np.random.randn(3, 2)
        idx = lrange(2)
        df = DataFrame(arr, columns=['A', 'A'])
        df.columns = idx
        expected = DataFrame(arr, columns=idx)
        check(df, expected)

        idx = date_range('20130101', periods=4, freq='Q-NOV')
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['a', 'a', 'a', 'a'])
        df.columns = idx
        expected = DataFrame(
            [[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]], columns=idx)
        check(df, expected)

        # insert
        df = DataFrame([[1, 1, 1, 5], [1, 1, 2, 5], [2, 1, 3, 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        df['string'] = 'bah'
        expected = DataFrame([[1, 1, 1, 5, 'bah'], [1, 1, 2, 5, 'bah'],
                              [2, 1, 3, 5, 'bah']],
                             columns=['foo', 'bar', 'foo', 'hello', 'string'])
        check(df, expected)
        with assertRaisesRegexp(ValueError, 'Length of value'):
            df.insert(0, 'AnotherColumn', range(len(df.index) - 1))

        # insert same dtype
        df['foo2'] = 3
        expected = DataFrame([[1, 1, 1, 5, 'bah', 3], [1, 1, 2, 5, 'bah', 3],
                              [2, 1, 3, 5, 'bah', 3]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)

        # set (non-dup)
        df['foo2'] = 4
        expected = DataFrame([[1, 1, 1, 5, 'bah', 4], [1, 1, 2, 5, 'bah', 4],
                              [2, 1, 3, 5, 'bah', 4]],
                             columns=['foo', 'bar', 'foo', 'hello',
                                      'string', 'foo2'])
        check(df, expected)
        df['foo2'] = 3

        # delete (non dup)
        del df['bar']
        expected = DataFrame([[1, 1, 5, 'bah', 3], [1, 2, 5, 'bah', 3],
                              [2, 3, 5, 'bah', 3]],
                             columns=['foo', 'foo', 'hello', 'string', 'foo2'])
        check(df, expected)

        # try to delete again (it's not consolidated)
        del df['hello']
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # consolidate
        df = df.consolidate()
        expected = DataFrame([[1, 1, 'bah', 3], [1, 2, 'bah', 3],
                              [2, 3, 'bah', 3]],
                             columns=['foo', 'foo', 'string', 'foo2'])
        check(df, expected)

        # insert
        df.insert(2, 'new_col', 5.)
        expected = DataFrame([[1, 1, 5., 'bah', 3], [1, 2, 5., 'bah', 3],
                              [2, 3, 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col', 'string',
                                      'foo2'])
        check(df, expected)

        # insert a dup
        assertRaisesRegexp(ValueError, 'cannot insert',
                           df.insert, 2, 'new_col', 4.)
        df.insert(2, 'new_col', 4., allow_duplicates=True)
        expected = DataFrame([[1, 1, 4., 5., 'bah', 3],
                              [1, 2, 4., 5., 'bah', 3],
                              [2, 3, 4., 5., 'bah', 3]],
                             columns=['foo', 'foo', 'new_col',
                                      'new_col', 'string', 'foo2'])
        check(df, expected)

        # delete (dup)
        del df['foo']
        expected = DataFrame([[4., 5., 'bah', 3], [4., 5., 'bah', 3],
                              [4., 5., 'bah', 3]],
                             columns=['new_col', 'new_col', 'string', 'foo2'])
        assert_frame_equal(df, expected)

        # dup across dtypes
        df = DataFrame([[1, 1, 1., 5], [1, 1, 2., 5], [2, 1, 3., 5]],
                       columns=['foo', 'bar', 'foo', 'hello'])
        check(df)

        df['foo2'] = 7.
        expected = DataFrame([[1, 1, 1., 5, 7.], [1, 1, 2., 5, 7.],
                              [2, 1, 3., 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        result = df['foo']
        expected = DataFrame([[1, 1.], [1, 2.], [2, 3.]],
                             columns=['foo', 'foo'])
        check(result, expected)

        # multiple replacements
        df['foo'] = 'string'
        expected = DataFrame([['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.],
                              ['string', 1, 'string', 5, 7.]],
                             columns=['foo', 'bar', 'foo', 'hello', 'foo2'])
        check(df, expected)

        del df['foo']
        expected = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]], columns=[
                             'bar', 'hello', 'foo2'])
        check(df, expected)

        # values
        df = DataFrame([[1, 2.5], [3, 4.5]], index=[1, 2], columns=['x', 'x'])
        result = df.values
        expected = np.array([[1, 2.5], [3, 4.5]])
        self.assertTrue((result == expected).all().all())

        # rename, GH 4403
        df4 = DataFrame(
            {'TClose': [22.02],
             'RT': [0.0454],
             'TExg': [0.0422]},
            index=MultiIndex.from_tuples([(600809, 20130331)],
                                         names=['STK_ID', 'RPT_Date']))

        df5 = DataFrame({'STK_ID': [600809] * 3,
                         'RPT_Date': [20120930, 20121231, 20130331],
                         'STK_Name': [u('饡驦'), u('饡驦'), u('饡驦')],
                         'TClose': [38.05, 41.66, 30.01]},
                        index=MultiIndex.from_tuples(
                            [(600809, 20120930),
                             (600809, 20121231),
                             (600809, 20130331)],
                            names=['STK_ID', 'RPT_Date']))

        k = pd.merge(df4, df5, how='inner', left_index=True, right_index=True)
        result = k.rename(
            columns={'TClose_x': 'TClose', 'TClose_y': 'QT_Close'})
        str(result)
        result.dtypes

        expected = (DataFrame([[0.0454, 22.02, 0.0422, 20130331, 600809,
                                u('饡驦'), 30.01]],
                              columns=['RT', 'TClose', 'TExg',
                                       'RPT_Date', 'STK_ID', 'STK_Name',
                                       'QT_Close'])
                    .set_index(['STK_ID', 'RPT_Date'], drop=False))
        assert_frame_equal(result, expected)

        # reindex is invalid!
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        self.assertRaises(ValueError, df.reindex, columns=['bar'])
        self.assertRaises(ValueError, df.reindex, columns=['bar', 'foo'])

        # drop
        df = DataFrame([[1, 5, 7.], [1, 5, 7.], [1, 5, 7.]],
                       columns=['bar', 'a', 'a'])
        result = df.drop(['a'], axis=1)
        expected = DataFrame([[1], [1], [1]], columns=['bar'])
        check(result, expected)
        result = df.drop('a', axis=1)
        check(result, expected)

        # describe
        df = DataFrame([[1, 1, 1], [2, 2, 2], [3, 3, 3]],
                       columns=['bar', 'a', 'a'], dtype='float64')
        result = df.describe()
        s = df.iloc[:, 0].describe()
        expected = pd.concat([s, s, s], keys=df.columns, axis=1)
        check(result, expected)

        # check column dups with index equal and not equal to df's index
        df = DataFrame(np.random.randn(5, 3), index=['a', 'b', 'c', 'd', 'e'],
                       columns=['A', 'B', 'A'])
        for index in [df.index, pd.Index(list('edcba'))]:
            this_df = df.copy()
            expected_ser = pd.Series(index.values, index=this_df.index)
            expected_df = DataFrame.from_items([('A', expected_ser),
                                                ('B', this_df['B']),
                                                ('A', expected_ser)])
            this_df['A'] = index
            check(this_df, expected_df)

        # operations
        for op in ['__add__', '__mul__', '__sub__', '__truediv__']:
            df = DataFrame(dict(A=np.arange(10), B=np.random.rand(10)))
            expected = getattr(df, op)(df)
            expected.columns = ['A', 'A']
            df.columns = ['A', 'A']
            result = getattr(df, op)(df)
            check(result, expected)

        # multiple assignments that change dtypes
        # the location indexer is a slice
        # GH 6120
        df = DataFrame(np.random.randn(5, 2), columns=['that', 'that'])
        expected = DataFrame(1.0, index=range(5), columns=['that', 'that'])

        df['that'] = 1.0
        check(df, expected)

        df = DataFrame(np.random.rand(5, 2), columns=['that', 'that'])
        expected = DataFrame(1, index=range(5), columns=['that', 'that'])

        df['that'] = 1
        check(df, expected)
Example #15
#2
data = np.random.randn(1000) # Normally distributed
cats = pd.qcut(data, 4) # Cut into quartiles
cats

pd.value_counts(cats)

pd.qcut(data, [0, 0.1, 0.5, 0.9, 1.])
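
qcut also accepts a labels argument, which is often friendlier than the interval objects when quartile membership is all you need (a sketch):

import numpy as np
import pandas as pd

data = np.random.randn(1000)
labeled = pd.qcut(data, 4, labels=['q1', 'q2', 'q3', 'q4'])
print(pd.value_counts(labeled))  # 250 per bucket, by construction
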


### Detecting and filtering outliers
#1
np.random.seed(12345)
data = DataFrame(np.random.randn(1000, 4))
data.describe()

col = data[3]
col[np.abs(col) > 3]

data[(np.abs(data) > 3).any(1)]

#2
data[np.abs(data) > 3] = np.sign(data) * 3
data.describe()


### Permutation and random sampling
#1
df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
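
The permutation is normally handed to take, which reorders rows; slicing it first gives sampling without replacement (a sketch completing the snippet):

import numpy as np
from pandas import DataFrame

df = DataFrame(np.arange(5 * 4).reshape((5, 4)))
sampler = np.random.permutation(5)
print(df.take(sampler))      # all rows, shuffled
print(df.take(sampler[:3]))  # 3 rows sampled without replacement
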
Example #16
df5["sum_col"] = df5.apply(sum_two_cols, axis=1)

print(df5)

import math


def int_float_squares(series):
    return pd.Series({"int_sq": series["int_col"] ** 2, "flt_sq": series["float_col"] ** 2})


print(df.apply(int_float_squares, axis=1))

### 7. Basic Stats ###

print(df.describe())
print(df.cov())
print(df.corr())

### 8. Merge and Join ###

print(df)
other = DataFrame({"str_col": ["a", "b"], "some_val": [1, 2]})
print(other)
print(pd.merge(df, other, on="str_col", how="inner"))
print(pd.merge(df, other, on="str_col", how="outer"))
print(pd.merge(df, other, on="str_col", how="left"))
print(pd.merge(df, other, on="str_col", how="right"))

### 9. Plot ###
pyplot.show()


# fit model (assumes: from statsmodels.tsa.arima_model import ARIMA)
model = ARIMA(series, order=(5,1,0))
model_fit = model.fit(disp=0)
print(model_fit.summary())

# plot residual errors
residuals = DataFrame(model_fit.resid)
residuals.plot()
pyplot.show()

residuals.plot(kind='kde')
pyplot.show()
print(residuals.describe())

# http://www.statsmodels.org/devel/generated/statsmodels.tsa.arima_model.ARIMA.predict.html

X = series.values
size = int(len(X) * 0.66)
train, test = X[0:size], X[size:len(X)]
history = [x for x in train]
predictions = list()
for t in range(len(test)):
	model = ARIMA(history, order=(5,1,0))
	model_fit = model.fit(disp=0)
	output = model_fit.forecast()
	yhat = output[0]
	predictions.append(yhat)
	obs = test[t]
	history.append(obs)  # walk-forward validation: fold each observation back into the history
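
Once the walk-forward loop finishes, the natural next step is to score the forecasts against the held-out values; a sketch, assuming scikit-learn is available:

from sklearn.metrics import mean_squared_error

error = mean_squared_error(test, predictions)
print('Test MSE: %.3f' % error)
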
p=DataFrame(predicted_probs)


# In[186]:

p.shape


# In[187]:

p.head(2)


# In[188]:

p.describe()


# In[189]:

get_ipython().magic(u'pinfo lr.predict_proba')


# In[190]:

p1=p[0]
p2=p[1]


# In[192]:
Example #19
"""
Student: Max Sorto
Class: IT5090G - Aasheim
Date: 03/31/2016
Assignment: Lab10
"""

from pandas import DataFrame

data = {'name':['Joe','John','Mary','Lee'],
	'quiz 1':[100,87,99,78],
	'quiz 2':[45,78,90,88],
	'assign 1':[98,82,93,78],
	'assign 2':[100,87,99,78]
}

frame = DataFrame(data)
print frame
print '\n'
print frame.describe()
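
describe() reports count/mean/std/min/quartiles/max for each numeric column; a single row of that summary can be pulled out with .loc (a sketch):

from pandas import DataFrame

grades = DataFrame({'quiz 1': [100, 87, 99, 78], 'quiz 2': [45, 78, 90, 88]})
print(grades.describe().loc['mean'])  # just the per-column means
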
def create_fip(year = None):
    assert year is not None
    # fip: fichier d'imposition des personnes (personal income-tax filing records)
    """
    Creates a 'fipDat' table containing all these 'fip individuals'
    """
    # Some individuals are declared as 'personne à charge' (pac) on 'tax forms'
    # but are not present in the erf or eec tables.
    # We add them to ensure consistency between concepts.

    temporary_store = TemporaryStore.create(file_name = "erfs")

    replace = create_replace(year)

    erfs_survey_collection = SurveyCollection.load(
        collection = 'erfs', config_files_directory = config_files_directory)
    survey = erfs_survey_collection.get_survey('erfs_{}'.format(year))

    log.info(u"Démarrage de 03_fip")

    # anaisenf is a string containing letter code of pac (F,G,H,I,J,N,R) and year of birth (example: 'F1990H1992')
    # when a child is invalid, he appears twice in anaisenf (example: F1900G1900 is a single invalid child born in 1990)
    erfFoyVar = ['declar', 'anaisenf']
    foyer = survey.get_values(table = replace["foyer"], variables = erfFoyVar)
    foyer.replace({'anaisenf': {'NA': np.nan}}, inplace = True)

    log.info(u"Etape 1 : on récupere les personnes à charge des foyers")
    log.info(u"    1.1 : Création des codes des enfants")
    foyer['anaisenf'] = foyer['anaisenf'].astype('string')
    nb_pac_max = len(max(foyer['anaisenf'], key=len)) / 5
    log.info(u"il ya a au maximum {} pac par foyer".format(nb_pac_max))

    # Separating the string coding the pac of each "déclaration".
    # Creating a list containing the new variables.

    # Creating the multi_index for the columns
    multi_index_columns = []
    assert int(nb_pac_max) == nb_pac_max, "nb_pac_max = {} which is not an integer".format(nb_pac_max)
    nb_pac_max = int(nb_pac_max)
    for i in range(1, nb_pac_max + 1):
        pac_tuples_list = [
            (i, 'declaration'),
            (i, 'type_pac'),
            (i, 'naia')
            ]
        multi_index_columns += pac_tuples_list

    columns = MultiIndex.from_tuples(
        multi_index_columns,
        names = ['pac_number', 'variable']
        )
    fip = DataFrame(np.random.randn(len(foyer), 3 * nb_pac_max), columns = columns)
    log.info("{}".format(fip.describe()))
    log.info("{}".format(fip.info()))

    for i in range(1, nb_pac_max + 1):  # TODO: using values to deal with mismatching indexes
        fip[(i, 'declaration')] = foyer['declar'].values
        fip[(i, 'type_pac')] = foyer['anaisenf'].str[5 * (i - 1)].values
        fip[(i, 'naia')] = foyer['anaisenf'].str[5 * (i - 1) + 1: 5 * i].values

    fip = fip.stack("pac_number")
    fip.reset_index(inplace = True)
    fip.drop(['level_0'], axis = 1, inplace = True)

    log.info(u"    1.2 : elimination des foyers fiscaux sans pac")
    # Clearing missing values and changing data format
    fip = fip[(fip.type_pac.notnull()) & (fip.naia != 'an') & (fip.naia != '')].copy()
    fip = fip.sort(columns = ['declaration', 'naia', 'type_pac'])
    # TODO: check if useful
    fip.set_index(["declaration", "pac_number"], inplace = True)
    fip = fip.reset_index()
    fip.drop(['pac_number'], axis = 1, inplace = True)
    # TODO: add box I: "Dont enfants titulaires de la carte d’invalidité" (children holding a disability card)
    assert fip.type_pac.isin(["F", "G", "H", "I", "J", "N", "R"]).all(), "Some pac types are unknown"
    # TODO: find a more explicit message

#    control(fip, debug=True, verbose=True, verbose_columns=['naia'])

    log.info(u"    1.3 : on enlève les individus F pour lesquels il existe un individu G")
    type_FG = fip[fip.type_pac.isin(['F', 'G'])].copy()  # Filtre pour ne travailler que sur F & G

    type_FG['same_pair'] = type_FG.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_FG['is_twin'] = type_FG.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_FG['to_keep'] = ~(type_FG['same_pair']) | type_FG['is_twin']
    # Note: keep the rows whose declar/naia pairs differ, plus the twins,
    #       then drop the others (both F and G)
    log.info(u"fip length {}".format(len(fip)))

    fip['to_keep'] = np.nan
    fip.update(type_FG)

    log.info(u"    1.4 : on enlève les H pour lesquels il y a un I")
    type_HI = fip[fip.type_pac.isin(['H', 'I'])].copy()
    type_HI['same_pair'] = type_HI.duplicated(subset = ['declaration', 'naia'], take_last = True)
    type_HI['is_twin'] = type_HI.duplicated(subset = ['declaration', 'naia', 'type_pac'])
    type_HI['to_keep'] = (~(type_HI['same_pair']) | (type_HI['is_twin'])).values

    fip.update(type_HI)
    fip['to_keep'] = fip['to_keep'].fillna(True)
    log.info(u"nb lines to keep = {} / nb initial lines {}".format(len(fip[fip['to_keep']]), len(fip)))

    indivifip = fip[fip['to_keep']].copy()
    del indivifip['to_keep'], fip, type_FG, type_HI
    #
    # control(indivifip, debug=True)

    log.info(u"Step 2 : matching indivifip with eec file")
    indivi = temporary_store['indivim_{}'.format(year)]
    pac = indivi[(indivi.persfip.notnull()) & (indivi.persfip == 'pac')].copy()
    assert indivifip.naia.notnull().all(), "The naia variable has missing values"

    pac['naia'] = pac.naia.astype('int32')  # TODO: was float in pac fix upstream
    indivifip['naia'] = indivifip.naia.astype('int32')
    pac['key1'] = zip(pac.naia, pac['declar1'].str[:29])
    pac['key2'] = zip(pac.naia, pac['declar2'].str[:29])
    indivifip['key'] = zip(indivifip.naia.values, indivifip['declaration'].str[:29].values)
    assert pac.naia.dtype == indivifip.naia.dtype, \
        "The dtypes of pac.naia {} and indivifip.naia {} differ".format(pac.naia.dtype, indivifip.naia.dtype)

    fip = indivifip[~(indivifip.key.isin(pac.key1.values))].copy()
    fip = fip[~(fip.key.isin(pac.key2.values))].copy()

    log.info(u"    2.1 new fip created")
#   We build a dataframe to link the pac to their type and noindiv
    tmp_pac1 = pac[['noindiv', 'key1']].copy()
    tmp_pac2 = pac[['noindiv', 'key2']].copy()
    tmp_indivifip = indivifip[['key', 'type_pac', 'naia']].copy()

    pac_ind1 = tmp_pac1.merge(tmp_indivifip, left_on='key1', right_on='key', how='inner')
    log.info(u"longueur pacInd1 {}".format(len(pac_ind1)))
    pac_ind2 = tmp_pac2.merge(tmp_indivifip, left_on='key2', right_on='key', how='inner')
    log.info(u"longueur pacInd2 {}".format(len(pac_ind2)))
    log.info(u"pacInd1 & pacInd2 créés")

    log.info("{}".format(pac_ind1.duplicated().sum()))
    log.info("{}".format(pac_ind2.duplicated().sum()))

    del pac_ind1['key1'], pac_ind2['key2']

    if len(pac_ind1.index) == 0:
        if len(pac_ind2.index) == 0:
                log.info(u"Warning : no link between pac and noindiv for both pacInd1&2")
        else:
            log.info(u"Warning : pacInd1 is an empty data frame")
            pacInd = pac_ind2
    elif len(pac_ind2.index) == 0:
        log.info(u"Warning : pacInd2 is an empty data frame")
        pacInd = pac_ind1
    else:
        pacInd = concat([pac_ind2, pac_ind1])
    log.info("{}{}{}".format(len(pac_ind1), len(pac_ind2), len(pacInd)))
    log.info("{}".format(pac_ind2.type_pac.isnull().sum()))
    log.info("{}".format(pacInd.type_pac.value_counts()))

    log.info(u"    2.2 : pacInd created")

    log.info(u"doublons noindiv, type_pac {}".format(pacInd.duplicated(['noindiv', 'type_pac']).sum()))
    log.info(u"doublons noindiv seulement {}".format(pacInd.duplicated('noindiv').sum()))
    log.info(u"nb de NaN {}".format(pacInd.type_pac.isnull().sum()))

    del pacInd["key"]
    pacIndiv = pacInd[~(pacInd.duplicated('noindiv'))].copy()
    # pacIndiv.reset_index(inplace=True)
    log.info("{}".format(pacIndiv.columns))

    temporary_store['pacIndiv_{}'.format(year)] = pacIndiv

    log.info("{}".format(pacIndiv.type_pac.value_counts()))
    gc.collect()

# # We keep the fip in the menage of their parents because it is used in to
# # build the famille. We should build an individual ident (ménage) for the fip that are
# # older than 18 since they are not in their parents' menage according to the eec

# individec1 <- subset(indivi, (declar1 %in% fip$declar) & (persfip=="vous"))
# individec1 <- individec1[,c("declar1","noidec","ident","rga","ztsai","ztsao")]
# individec1 <- upData(individec1,rename=c(declar1="declar"))
# fip1       <- merge(fip,individec1)
# indivi$noidec <- as.numeric(substr(indivi$declar1,1,2))
    log.info("{}".format(indivi['declar1'].str[0:2].value_counts()))
    log.info("{}".format(indivi['declar1'].str[0:2].describe()))
    log.info("{}".format(indivi['declar1'].str[0:2].notnull().all()))
    log.info("{}".format(indivi.info()))
    selection = indivi['declar1'].str[0:2] != ""
    indivi['noidec'] = indivi.declar1[selection].str[0:2].astype('int32')  # To be used later to set idfoy

    individec1 = indivi[(indivi.declar1.isin(fip.declaration.values)) & (indivi.persfip == "vous")]
    individec1 = individec1[["declar1", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec1 = individec1.rename(columns = {'declar1': 'declaration'})
    fip1 = fip.merge(individec1, on = 'declaration')
    log.info(u"    2.3 : fip1 created")

# # TODO: declar2 is not handled for now
# # individec2 <- subset(indivi, (declar2 %in% fip$declar) & (persfip=="vous"))
# # individec2 <- individec2[,c("declar2","noidec","ident","rga","ztsai","ztsao")]
# # individec2 <- upData(individec2,rename=c(declar2="declar"))
# # fip2 <-merge(fip,individec2)

    individec2 = indivi[(indivi.declar2.isin(fip.declaration.values)) & (indivi['persfip'] == "vous")]
    individec2 = individec2[["declar2", "noidec", "ident", "rga", "ztsai", "ztsao"]].copy()
    individec2.rename(columns = {'declar2': 'declaration'}, inplace = True)
    fip2 = fip.merge(individec2)
    log.info(u"    2.4 : fip2 created")

    fip1.duplicated().value_counts()
    fip2.duplicated().value_counts()

    fip = concat([fip1, fip2])

    fip['persfip'] = 'pac'
    fip['year'] = year
    fip['year'] = fip['year'].astype('float')  # BUG: no year column in the DF
    fip['noi'] = 99
    fip['noicon'] = None
    fip['noindiv'] = fip['declaration']
    fip['noiper'] = None
    fip['noimer'] = None
    fip['declar1'] = fip['declaration']  # TODO: declar ?
    fip['naim'] = 99
    fip['lien'] = None
    fip['quelfic'] = 'FIP'
    fip['acteu'] = None
    fip['agepf'] = fip['year'] - fip.naia.astype('float')
    fip['lpr'] = (fip['agepf'] <= 20) * 3 + (fip['agepf'] > 20) * 4
    fip['stc'] = None
    fip['contra'] = None
    fip['titc'] = None
    fip['mrec'] = None
    fip['forter'] = None
    fip['rstg'] = None
    fip['retrai'] = None
    fip['cohab'] = None
    fip['sexe'] = None
    fip['persfip'] = "pac"
    fip['agepr'] = None
    fip['actrec'] = (fip['agepf'] <= 15) * 9 + (fip['agepf'] > 15) * 5

## TODO: actrec problem for fip children between 16 and 20: we cannot tell whether they are students or employed
## TODO: problem with the birth months of FIP children: see whether those values can be recovered. Alexis: clearly not

# Reassigning noi for fip children if they are more than one per foyer fiscal
# while ( any(duplicated( fip[,c("noi","ident")]) ) ) {
#   dup <- duplicated( fip[, c("noi","ident")])
#   tmp <- fip[dup,"noi"]
#   fip[dup, "noi"] <- (tmp-1)
# }
    # TODO: is the dup vector correct?
    fip["noi"] = fip["noi"].astype("int64")
    fip["ident"] = fip["ident"].astype("int64")

    fip_tmp = fip[['noi', 'ident']]

    while any(fip.duplicated(cols=['noi', 'ident'])):
        fip_tmp = fip.loc[:, ['noi', 'ident']]
        dup = fip_tmp.duplicated()
        tmp = fip.loc[dup, 'noi']
        log.info("{}".format(len(tmp)))
        fip.loc[dup, 'noi'] = tmp.astype('int64') - 1

    fip['idfoy'] = 100 * fip['ident'] + fip['noidec']
    fip['noindiv'] = 100 * fip['ident'] + fip['noi']
    fip['type_pac'] = 0
    fip['key'] = 0

    log.info("{}".format(fip.duplicated('noindiv').value_counts()))
    temporary_store['fipDat_{}'.format(year)] = fip
    del fip, fip1, individec1, indivifip, indivi, pac
    log.info(u"fip sauvegardé")
print dict_of_lists,'\n'
        
#convert the list of values for each key in dictionary to Series
countries = Series(dict_of_lists['countries'])
gold = Series(dict_of_lists['gold'])
silver = Series(dict_of_lists['silver'])
bronze= Series(dict_of_lists['bronze'])
       
#construct a dictionary of Series that can be turned into a DataFrame
medal_tally_dict = {'countries' : countries, 'gold': gold, 'silver': silver, 'bronze': bronze}
df = DataFrame(medal_tally_dict)
print df,'\n'

#DataFrame and Series properties
print df[['countries','gold']],'\n'
print df.describe(),'\n'
print 'The gold series in the dataframe is of dtype : ',df['gold'].dtype 
print 'Number of countries : ',len(df['countries'])
print 'The mean of golds won where bronze medal count is at least 2 : ',df['gold'][df['bronze']>=2].mean()
print 'Mean of combined gold and silver medal counts : ',(df['gold']+df['silver']).mean()
print 'Mean of golds : ',df['gold'].mean()
print 'Mean of bronzes : ',df['bronze'].mean()
print 'Max number of golds won by a country : ',df['gold'].max()
print 'Sum of golds won by all countries : ',df['gold'].sum()

# WIP
def standardize_data(values):
    standardized_values = (values - values.mean() ) / values.std()
    print standardized_values,'\n'

with open('C:\omnica\data\managers1.csv','w') as csvfile:
df.idxmax()  # index of the max value in each column
'''
one    b
two    b
'''
print
df.cumsum()  # cumulative sum of each column
'''
   one  two
a  1.0  NaN
b  8.0  4.0
c  NaN  NaN
d  8.0  5.0
'''
print
df.describe()  # summary statistics for each DataFrame column
'''
            one      two
count  3.000000  2.00000
mean   2.666667  2.50000
std    3.785939  2.12132
min    0.000000  1.00000
25%         NaN      NaN
50%         NaN      NaN
75%         NaN      NaN
max    7.000000  4.00000
'''
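
The NaN entries in the 25%/50%/75% rows above are an artifact of the old pandas version behind this transcript; modern pandas drops missing values before interpolating, so those rows come out filled. A quick check (a sketch, with the frame reconstructed from the cumsum output above):

import numpy as np
from pandas import DataFrame

demo = DataFrame({'one': [1.0, 7.0, np.nan, 0.0], 'two': [np.nan, 4.0, np.nan, 1.0]})
print(demo.describe())  # quantile rows are populated; NaN values are simply skipped
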
obj = Series([2, 4, 8, 4], index=['a', 'a', 'b', 'c'])
print
obj.describe()  # summary statistics for a Series
'''
Example #23
# as well as descending.
equipos.sort_index(ascending=1)
equipos.sort_index(ascending=0)


# d) Selecting the 'socios' column, show the result of sorting by
# values.
equipos.sort_values(by='socios')


##
# Exercise 4
##
# a) We keep working with the DataFrame created in exercise 1. Show a summary
# of its contents.
equipos.describe()

# b) To extend the information in our data, concatenate a DataFrame with the
# following content to ours:
new_data = {'equipo': ['Atletico de Madrid'],
            'titulos': [29],
            'socios': [48008]}

equipos = equipos.append(new_data,ignore_index=True)


# c) Create a new column 'posicion' with the following data:
posicion_values = ['13', np.nan, '3', np.nan, '5', np.nan]

equipos['posicion'] = posicion_values
Example #24
empDf.append(Series([5,False,'Derek',2],
                    index=['id','isManager','name','deptId']),
             ignore_index=True)
empDf

#Deleting a column
empDf['dummy']=1
empDf
del empDf['dummy']
empDf

#Sorting
empDf.sort_index(axis=1)
empDf.sort(['isManager','name'])

empDf.describe()
empDf.id.corr(empDf.deptId)

#Iterate through a data frame
for rowNum, row in auto_data.iterrows():
    for colName, col in row.iteritems():
        if pd.isnull(col):
            print(pd.isnull(col),rowNum,colName)
Example #25
df
df = DataFrame([[1.4, np.nan], [7.1, -4.5],
                [np.nan, np.nan], [0.75, -1.3]],
               index=['a', 'b', 'c', 'd'],
               columns=['one', 'two'])
df
df.sum() # columns sum
df.sum(axis=1) # sum row by row
df
(7.10 - 4.5)/2
df.mean(axis=1, skipna=False)
df
df.idxmax()
df
df.cumsum() # accumultation
df.describe() # multiple summary statistics in one shot.
obj = Series(['a', 'a', 'b', 'c'] * 4)
obj
obj.describe()
## Correlation and Covariance
import pandas.io.data as web
all_data = {}
for ticker in ['AAPL', 'IBM', 'MSFT', 'GOOG']:
    all_data[ticker] = web.get_data_yahoo(ticker, '1/1/2000', '1/1/2010')
    
price = DataFrame({tic: data['Adj Close']
                   for tic, data in all_data.iteritems()})
price
volume = DataFrame({tic: data['Volume']
                    for tic, data in all_data.iteritems()})
Example #26
# build a DataFrame
smp = {'state': ['Ohio', 'Ohio', 'Ohio', 'Nebada', 'Nebada'],
       'year': [2000, 2001, 2002, 2001, 2002],
       'pop': [1.5, 1.6, 1.7, 3.5, 4.3]
       }
frame = DataFrame(smp)

# accessing DataFrame elements
frame.year  # frame$year
frame['year']  # frame$year
frame.head()  # head
frame.tail()  # tail
frame2 = DataFrame(
    smp, index=['one', 'two', 'three', 'four', 'five'])  # add an index
frame2.ix['one']
frame2.describe()  # summary
print(frame2.describe())

# load data from a file
data = pd.read_csv('stock_px.csv')
print(data)
xlsx_file = pd.ExcelFile('stock_px.xlsx')  # requires openpyxl; xls files work too
xlsx_file.sheet_names
data = xlsx_file.parse('stock_px')
print(data)

# load data from the web -> http://docs.scipy.org/doc/numpy/reference/generated/numpy.DataSource.html
ds = np.DataSource(None)
f = ds.open('https://dl.dropbox.com/u/956851/game_modified.csv')
d_web = pd.read_csv(f)
print(d_web)
Example #27
import numpy as np
randn = np.random.randn
import pandas as pd
from pandas import Series, DataFrame

np.random.seed(12345)

data = DataFrame(np.random.randn(1000, 4))
print data.describe()

col = data[3]
print col[np.abs(col) > 3]
print data[(np.abs(data) > 3).any(1)]  #To select all rows having a value exceeding 3 or -3, you can use the any method on a boolean DataFrame:
data[np.abs(data) > 3] = np.sign(data) * 3
print data.describe()
def main():
    """
    Calculation and aggregation of summary statistics
    """

    # Summary statistics
    # (reductions return a Series, not an ndarray)
    df = DataFrame([[1.4, np.nan],
                    [7.1, -4.5],
                    [np.nan, np.nan],
                    [0.75, -1.3]],
                   index=list('abcd'),
                   columns=['one', 'two'])
    print df
    print df.sum()
    print df.sum(axis=1)
    print df.mean(axis=1) # exclude nan
    print df.mean(axis=1, skipna=False)
    print df.idxmin()
    print df.idxmax()
    print df.cumsum()
    print df.describe()
    # non-numeric values
    obj = Series(list('aabc') * 4)
    print obj.describe()


    methods = ['count', 'min', 'max', # 'argmin', 'argmax',
               'quantile', 'median', 'mad', 'var', 'std',
               'skew', 'kurt', 'cummin', 'cummax', 'cumprod',
               'diff', 'pct_change']

    for method in methods:
        print u'「{0}」'.format(method)
        print getattr(df, method)()
        print ''

    # Correlation and Covariance
    all_data = {}
    lst = []  # e.g. ['AAPL', 'IBM', 'MSFT', 'GOOG']
    for ticket in lst:
        # IOError: after 3 tries, Yahoo! did not return a 200
        # for url 'http://ichart.finance.yahoo.com/table.csv?s=GOOG&a=0&b=1&c=2000&d=0&e=1&f=2010&g=d&ignore=.csv'
        all_data[ticket] = pd.io.data.get_data_yahoo(ticket, '1/1/2000', '1/1/2010')
    price = DataFrame({tic: data['Adj Close'] for tic, data in all_data.iteritems()})
    volume = DataFrame({tic: data['Volume'] for tic, data in all_data.iteritems()})
    if all_data:
        returns = price.pct_change()
        print returns.tail()
        print ''
        print returns.MSFT.corr(returns.IBM)
        print returns.MSFT.cov(returns.IBM)
        print ''
        print returns.corr()
        print returns.cov()
        print ''
        print returns.corrwith(returns.IBM)
        print returns.corrwith(volume)

    # unique values, value counts, membership
    print '',''
    obj = Series(list('cadaabbcc'))
    uniques = obj.unique()
    print uniques
    print obj.value_counts()
    print pd.value_counts(obj.values, sort=False)
    mask = obj.isin(['b', 'c'])
    print mask
    print obj[mask]

    data = DataFrame({
        'Qu1' : [1,3,4,3,4],
        'Qu2' : [2,3,1,2,3],
        'Qu3' : [1,5,2,4,4],
    })
    print data
    print data.apply(pd.value_counts).fillna(0)
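
apply(pd.value_counts) builds a per-column histogram: the result's index is the set of values observed anywhere, each cell is that value's frequency in the column, and fillna(0) plugs the combinations that never occur. The same idea, self-contained (sketch):

import pandas as pd
from pandas import DataFrame

data = DataFrame({'Qu1': [1, 3, 4, 3, 4],
                  'Qu2': [2, 3, 1, 2, 3],
                  'Qu3': [1, 5, 2, 4, 4]})
print(data.apply(pd.value_counts).fillna(0))  # rows: values 1..5; cells: frequencies
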
Example #29
obj['a']

## Summarizing and computing descriptive statistics
df = DataFrame([[1.4,np.nan],[7.1,-4.5],[np.nan,np.nan],[0.75,-1.3]],
	index=['a','b','c','d'], columns=['one','two'])
# per column
df.sum()
# per row
df.sum(axis=1)
# NA values are excluded by default, but skipna can disable that
df.mean(axis=1,skipna=False)
# index of the maximum value
df.idxmax()
# cumulative sum
df.cumsum()
df.describe()
# correlation coefficients
returns.MSFT.corr(returns.IBM)
returns.corr()
returns.cov()
returns.corrwith(returns.IBM)

## Unique values, value counts, and membership
obj = Series(['c','a','d','a','a','b','b','c','c'])
uniques = obj.unique()
# count occurrences
obj.value_counts()
# counts are sorted by default; sorting can be turned off
pd.value_counts(obj.values, sort=False)
# membership test
mask = obj.isin(['b','c'])
df.applymap(f)
print df
print "*"*15

print "Definimos de nuevo el dataframe"
df = pd.DataFrame(data={"A":[1,2], "B":[2.6,1.3]})
print df
print "añadimos columnas combinando las actuales"
df["C"] = df["A"]+df["B"]
df["D"] = df["A"]*3
df["E"] = np.sqrt(df["A"])
print df
print "*"*15
print "Datos disponibles de un dataframe"
print " descripcion del dataframe"
print df.describe()
print " covarianza "
print df.cov()
print " correlación "
print df.corr()
print "*"*15

print " Creamos otro dataframe con valores aleatorios (1000 filas y 2 columnas "
print " DataFrame(np.random.randn(1000,2),columns=['x','y'])"
plot_df = DataFrame(np.random.randn(1000,2),columns=['x','y'])
print plot_df
print "Mostramos las graficas"
plot_df.plot()
plot_df.hist()