def testPandasGroupbyFilter(self):
        """Filter a grouped aggregate built on top of an in-memory pandas frame.

        The same filter is checked on a freshly built aggregate and on one
        that has already been executed (whose ``_cache_data`` is asserted to
        be populated), and repeated executions must yield the same rows.
        A plain column sum is executed twice as well.
        """
        import pandas as pd

        rows = [[2001, 1], [2002, 2], [2003, 3]]
        df = DataFrame(pd.DataFrame(rows, columns=['id', 'fid']))
        expected = [[2003, 3]]

        # Filter applied to an aggregate that has not been executed yet.
        grouped = df.groupby('id').agg(df.fid.sum())
        filtered = grouped[grouped.id == 2003]
        self.assertEqual(filtered.execute().values.values.tolist(), expected)

        # Execute the aggregate first, then filter on top of it.
        grouped = df.groupby('id').agg(df.fid.sum())
        grouped.execute()
        self.assertIsNotNone(grouped._cache_data)
        filtered = grouped[grouped.id == 2003]

        for _ in range(2):
            self.assertEqual(filtered.execute().values.values.tolist(), expected)

        total = df.fid.sum()
        for _ in range(2):
            self.assertEqual(total.execute(), 6)
Exemple #2
0
    def testPandasGroupbyFilter(self):
        """Filter a grouped aggregate built on top of an in-memory pandas frame.

        The filter is verified against a fresh aggregate and against one that
        was executed beforehand (``context.is_cached`` must report it), and
        executing the filtered frame repeatedly must return the same rows.
        A plain column sum is executed twice as well.
        """
        import pandas as pd

        rows = [[2001, 1], [2002, 2], [2003, 3]]
        df = DataFrame(pd.DataFrame(rows, columns=['id', 'fid']))
        expected = [[2003, 3]]

        # Filter applied to an aggregate that has not been executed yet.
        grouped = df.groupby('id').agg(df.fid.sum())
        filtered = grouped[grouped.id == 2003]
        self.assertEqual(filtered.execute().values.values.tolist(), expected)

        # Execute the aggregate first, then filter on top of it.
        grouped = df.groupby('id').agg(df.fid.sum())
        grouped.execute()
        self.assertTrue(context.is_cached(grouped))
        filtered = grouped[grouped.id == 2003]

        for _ in range(2):
            self.assertEqual(filtered.execute().values.values.tolist(), expected)

        total = df.fid.sum()
        for _ in range(2):
            self.assertEqual(total.execute(), 6)
 def test_df_store(self):
     """Persist a per-class count, sorted by class descending, into a table.

     The source frame is the ``part1=1,part2=2`` partition of the two-part
     ionosphere table; the target table is removed before persisting.
     """
     self.delete_table(IONOSPHERE_SORTED_TABLE_PART)
     self.create_ionosphere_two_parts(IONOSPHERE_TABLE_TWO_PARTS)

     source_table = self.odps.get_table(IONOSPHERE_TABLE_TWO_PARTS)
     df = DataFrame(source_table).filter_partition('part1=1,part2=2')

     self.odps.delete_table(IONOSPHERE_SORTED_TABLE_PART)

     by_class = df.groupby(df['class'])
     row_count = df.a01.count().rename('count')
     ordered = by_class.agg(row_count).sort('class', ascending=False)
     ordered.persist(IONOSPHERE_SORTED_TABLE_PART)
    def testRepeatSetItem(self):
        """Assign the same computed column twice and check the row count.

        The ``rank`` column (rank of ``id`` within each ``name`` group,
        ordered by ``id``) is set two times in a row; executing the frame
        afterwards must still return 3 rows.
        """
        df = DataFrame(self.table)

        for _ in range(2):
            df['rank'] = df.groupby('name').sort('id').id.rank()

        result = df.execute()
        self.assertEqual(len(result), 3)
 def test_df_store(self):
     """Persist a per-class count, sorted by class descending, into a table.

     The source frame is the ``part1=1,part2=2`` partition of the two-part
     ionosphere table; the target table is dropped before persisting.
     """
     self.delete_table(IONOSPHERE_SORTED_TABLE_PART)
     self.create_ionosphere_two_parts(IONOSPHERE_TABLE_TWO_PARTS)
     df = DataFrame(self.odps.get_table(IONOSPHERE_TABLE_TWO_PARTS)).filter_partition('part1=1,part2=2')
     # ``async`` is a reserved keyword since Python 3.7, so spelling it as a
     # literal keyword argument is a SyntaxError there. Unpacking a dict
     # passes the very same keyword to drop_table on every Python version.
     drop_table(self.odps, IONOSPHERE_SORTED_TABLE_PART, **{'async': False})
     sorted_df = df.groupby(df['class']).agg(df.a01.count().rename('count')).sort('class', ascending=False)
     sorted_df.persist(IONOSPHERE_SORTED_TABLE_PART)
Exemple #6
0
def chengbenjiaupdatedf(dfsall, cnxxc):
    """
    Attach a cost unit price, cost amount and gross profit to sales records.

    Purchase records are read from the ``jinghuomingxi`` table, aggregated
    into per-product price records, and each price record is then applied to
    the matching sales rows.

    :param dfsall: sales detail records sorted by date. It is modified in
        place: the columns '成本单价', '成本金额' and '毛利' are added, and a
        temporary '年月' column is added and removed again.
        (presumably a pandas DataFrame -- it is used via ``.shape``,
        ``.apply``, ``.loc`` and ``groupby(..., as_index=False)``)
    :param cnxxc: database connection, used to query the purchase records
        from which the product price-change records are generated
    :return: the same ``dfsall`` object with the added columns
    """

    # Read the purchase records. Only rows with a non-negative amount are
    # selected (the original note says this excludes return records).
    dfpros = pd.read_sql_query(
        'select 产品名称, strftime(\'%Y%m\',日期) as 年月, 金额 as 进货金额, 数量 as 进货数量, '
        '单价 as 进货单价 from jinghuomingxi where 金额 >=0 order by 年月, 产品名称', cnxxc)
    dfpros = DataFrame(dfpros)
    # Report the purchase rows whose amount is exactly zero.
    descdb(dfpros[dfpros.进货金额 == 0].to_pandas())

    # Aggregate per product and month, derive the cost unit price
    # (amount / quantity, rounded to 2 decimals), then group again per
    # product and unit price to produce the price-adjustment records.
    dfpro = dfpros.groupby(['产品名称', '年月']).agg(进货金额=dfpros.进货金额.sum(),
                                               进货数量=dfpros.进货数量.sum())
    dfpro = dfpro[dfpro, (dfpro.进货金额 / dfpro.进货数量).round(2).rename('单价')]
    descdb(dfpro.to_pandas())
    # For each (product, unit price) keep the earliest month it appears in,
    # and order the result by product name and month.
    dfpro = dfpro.groupby(['产品名称', '单价'
                           ]).agg(年月=dfpro.年月.min(),
                                  进货金额=dfpro.进货金额.sum()).sort(['产品名称', '年月'])
    descdb(dfpro.to_pandas())
    # Log the number of per-product groups and the total number of price records.
    log.info('共有%d条产品价格记录,共有%d条产品价格记录(含调价)' % (dfpro.groupby('产品名称').agg(
        dfpro.单价.count()).to_pandas().shape[0], dfpro.to_pandas().shape[0]))

    log.info('共有%d条销售明细记录' % dfsall.shape[0])
    # Temporary year-month key in the same '%Y%m' format as the purchase side.
    dfsall['年月'] = dfsall['日期'].apply(
        lambda x: datetime.datetime.strftime(x, '%Y%m'))
    # Total sales amount per product, renamed so it can be merged with the
    # purchase-side product names.
    dfprosall = dfsall.groupby('商品全名', as_index=False)['金额'].sum()
    dfprosall.rename(columns={'商品全名': '产品名称', '金额': '销售金额'}, inplace=True)

    # Outer-join the purchased-product list with the sold-product list to
    # inspect the records that are missing on either side.
    dfproall = pd.merge(dfpro.groupby(
        ['产品名称']).agg(进货金额=dfpro.进货金额.sum()).to_pandas(),
                        dfprosall,
                        how='outer')
    descdb(dfproall)
    # Log the purchased products that have no sales amount after the merge.
    log.info('以下进货产品在本期无销售记录:%s' %
             list(dfproall[dfproall.销售金额.isnull().values == True]['产品名称']))

    # Sales rows that match no price record keep a cost unit price of 0.
    dfsall['成本单价'] = 0
    dfpro = dfpro.to_pandas()
    # Apply every price record to the sales rows of the same product whose
    # year-month is on or after the record's year-month. Rows matched by
    # several records end up with the price of the last record iterated.
    # NOTE(review): this relies on the product/month ordering from the sort
    # above surviving to_pandas() -- confirm.
    for idx in dfpro.index:
        dfsall.loc[dfsall[(dfsall.商品全名 == dfpro.loc[idx]['产品名称'])
                          & (dfsall.年月 >= dfpro.loc[idx]['年月'])].index,
                   ['成本单价']] = dfpro.loc[idx]['单价']

    # Cost amount = cost unit price * quantity; gross profit = amount - cost amount.
    dfsall['成本金额'] = dfsall['成本单价'] * dfsall['数量']
    dfsall['毛利'] = dfsall['金额'] - dfsall['成本金额']
    descdb(dfsall)
    del dfsall['年月']  # drop the intermediate helper column
    descdb(dfsall)

    return dfsall
 def test_df_method(self):
     """Build a per-class count sorted by class descending and fetch it via ``to_pandas``."""
     self.create_ionosphere(IONOSPHERE_TABLE)

     table = self.odps.get_table(IONOSPHERE_TABLE)
     df = DataFrame(table)

     by_class = df.groupby(df['class'])
     row_count = df.a01.count().rename('count')
     ordered = by_class.agg(row_count).sort('class', ascending=False)
     ordered.to_pandas()
Exemple #8
0
# -*- coding: utf-8 -*-
"""
Created on Sat Sep  2 20:07:27 2017

@author: shuai.qian
"""
import matplotlib.pyplot as plt
from odps.df import DataFrame
from odps import ODPS

# Connect with placeholder (empty) credentials and wrap the table in a DataFrame.
o = ODPS('',project='', endpoint='')
t = DataFrame(o.get_table('tmp_ods_mc_testing_dlt'))
print("=================================> START <==================================")
#print(t.dtypes)
#print(t["class"].head(5))
# Per-class row count; the resulting expression is not bound to a name here.
t.groupby('class').agg(count = t['class'].count())

# %matplotlib inline

# Bar chart of how often each value of the 'class' column occurs.
t['class'].value_counts().plot(kind = 'bar', x = 'class', xlabel = 'cnt' )

# range() is an immutable sequence on Python 3 and has no pop(); build a real
# list first so the element at index 1 can be removed on Python 2 and 3 alike.
tmp = list(range(0, 10, 2))
tmp.pop(1)
 def test_df_method(self):
     """Build a per-class count sorted by class descending and fetch it via ``to_pandas``."""
     self.create_ionosphere(IONOSPHERE_TABLE)

     table = self.odps.get_table(IONOSPHERE_TABLE)
     df = DataFrame(table)

     by_class = df.groupby(df['class'])
     row_count = df.a01.count().rename('count')
     ordered = by_class.agg(row_count).sort('class', ascending=False)
     ordered.to_pandas()