Exemple #1
0
def test_DaPy(files):
    """Time DaPy on load / traverse / sort / save for *files*.

    Returns a 5-tuple of durations in clock units:
    (load, traverse, sort, save, total).
    """
    start = clock()
    dataset = dp.read(files)
    loaded = clock()
    # Convert to a Frame and unwrap the underlying data container
    # before timing the traversal.
    dataset.toframe()
    dataset = dataset.data
    traverse_start = clock()
    for record in dataset:
        current = record
    traversed = clock()
    dataset.sort(('Price', 'ASC'))
    sorted_at = clock()
    dp.save('test_DaPy.csv', dataset)
    saved = clock()
    return (loaded - start, traversed - traverse_start,
            sorted_at - traversed, saved - sorted_at, saved - start)
 def _forward(self, S, E, I, D, C, param, max_iter):
     """Step a five-compartment model forward *max_iter* times.

     *param* unpacks to the rates (a1, a2, b, s, g).  Returns a
     dp.Table with one row per iteration holding the updated
     S, E, I, D, C values.
     """
     a1, a2, b, s, g = param
     history = dp.Table(columns=['S', 'E', 'I', 'D', 'C'])
     for _ in range(max_iter):
         # Compute every compartment from the current state first,
         # then commit the whole step at once.
         next_S = S - a1 * E - a2 * I + s * I
         next_E = E + a1 * E + a2 * I - b * E
         next_I = I + b * E - s * I - g * I
         next_D = D + g * I
         next_C = C + s * I
         S, E, I, D, C = next_S, next_E, next_I, next_D, next_C
         history.append_row([S, E, I, D, C])
     return history
Exemple #3
0
def test_load(files):
    gc.disable()

    # Testing of Pandas
    t1 = time.time()

    data_pandas = pd.DataFrame(pd.read_csv(files))

    t2 = time.time()

    for index in data_pandas.index:
        this_line = data_pandas.loc[index].values[:]

    t3 = time.time()

    # Tesing of Datapy
    data_DaPy = dp.DataSet(files)
    data_DaPy.readframe()

    t4 = time.time()

    for item in data_DaPy:
        this_line = item

    t5 = time.time()

    # Tesing of Numpy
    data_numpy = np.genfromtxt(files, dtype=None, delimiter=',', names=True)

    t6 = time.time()

    for line in data_numpy:
        this_line = line

    t7 = time.time()

    # File Information
    with open(files, 'r') as f:
        data_file = f.read()

    # Calculating the memory of each data set
    size_pandas = sys.getsizeof(data_pandas)
    size_DaPy = sys.getsizeof(data_DaPy.data)
    size_numpy = sys.getsizeof(data_numpy)
    size_file = sys.getsizeof(data_file)

    gc.enable()
    gc.collect()

    print '\n'
    print '               DaPy | Pandas | Numpy | File'
    print '    Load Time:{0:^4.2f}\t| {1:^4.2f}\t| {2: ^4.2f}\t|  -'.format(
        t4 - t3, t2 - t1, t6 - t5)
    print 'Traverse Time:{0:^4.1f}\t| {1:^4.1f}\t| {2: ^4.1f}\t|  -'.format(
        t5 - t4, t3 - t2, t7 - t6)
    print '  Total Spent:{0:^4.1f}\t| {1:^4.1f}\t| {2: ^4.1f}\t|  -'.format(
        t5 - t3, t3 - t1, t7 - t5)
    print '  Memory Size: %dMB\t| %dMB\t| %dMB\t| %dMB' % (
        size_DaPy // 1048575, size_pandas // 1048575, size_numpy // 1048575,
        size_file // 1048575)
    print '\n'
Exemple #4
0
 def __init__(self, addr):
     """Load the dataset at *addr* with DaPy and launch this thread."""
     threading.Thread.__init__(self)
     self._data = dp.read(addr)
     # NOTE(review): starting the thread from __init__ means run() can
     # execute before any subclass finishes initialising -- confirm intended.
     self.start()
#!/usr/bin/env python
# coding: utf-8

# In[43]:


#Data preprocessing
import numpy as np
import DaPy as dp
from random import randint, random


# In[45]:


# Load the epidemic-update CSV; .data unwraps the container dp.read returns.
data = dp.read('Updates_NC.csv').data

def decode(val, code='utf-8'):
    """Decode *val* with codec *code* when it is bytes; otherwise pass it through."""
    return val.decode(code) if isinstance(val, bytes) else val
# Decode the byte-string columns to text in place.
# NOTE(review): the result of map(..., inplace=True) is assigned back to
# `data` -- confirm DaPy returns the mutated set here rather than None.
data = data.map(decode, cols=['报道时间', '省份', '城市', '消息来源'], inplace=True)


# In[ ]:


# Only ~6 days of pre-Jan-23 data could be found around the 10th; that was
# tried but worked poorly, so post-lockdown data is included as well, taking
# Jan 16 as the starting point.
# On Feb 12 the diagnostic criteria changed and counts jumped, so the training
# window runs Jan 17 - Feb 11.  Containment intensity varied over that period;
# it is assumed constant here.
wuhan = data.query(' 城市 == "武汉市" and 报道时间 >"1月16日"').reverse()
wuhan = wuhan.groupby('报道时间', max, apply_col=['新增确诊', '新增出院', '新增死亡'])
Exemple #6
0
import numpy as np
import DaPy as dp
from timeit import Timer

# 4x4 integer test matrix shared by all the benchmark functions below,
# materialised once as a DaPy matrix and once as a numpy matrix.
X = [[17, 2, 9, 2], [21, 8, 1, 46], [4, 3, 2, 13], [23, 1, 31, 3]]

X_dp = dp.mat(X)
# NOTE(review): np.mat/np.matrix is deprecated in favour of ndarray, but the
# benchmark relies on matrix `*` meaning matrix multiplication -- keep as-is.
X_np = np.mat(X)


def numpy_multi():
    """Benchmark body: matrix-matrix and matrix-scalar multiply with numpy."""
    _ = X_np * X_np
    _ = X_np * 32


def dapy_multi():
    """Benchmark body: matrix-matrix and matrix-scalar multiply with DaPy."""
    _ = X_dp * X_dp
    _ = X_dp * 32


def numpy_dot():
    """Benchmark body: X'X via transpose-then-dot with numpy."""
    _ = X_np.T.dot(X_np)


def dapy_dot():
    """Benchmark body: X'X via transpose-then-dot with DaPy."""
    _ = X_dp.T.dot(X_dp)


def numpy_attribute():
    """Benchmark body: transpose and inverse attribute access with numpy."""
    _ = X_np.T
    _ = X_np.I
Exemple #7
0
import DaPy as dp

# Demonstrate SeriesSet.merge under every combination of keep_key and
# keep_same.  The operands are rebuilt inside the loop so each combination
# starts from identical data; `left` is printed afterwards to show the result.
for key in (True, False, 'other', 'self'):
    for same in (True, False):
        left = dp.SeriesSet(
            [['Alan', 35], ['Bob', 27], ['Charlie', 30], ['Daniel', 29]],
            ['Name', 'Age'])
        right = dp.SeriesSet([['Alan', 'M', 35], ['Bob', 'M', 27],
                              ['Charlie', 'F', 30], ['Janny', 'F', 26]],
                             ['Name', 'gender', 'Age'])

        print 'MERGE with keep_key=%s and keep_same=%s' % (key, same)
        left.merge(right, 'Name', 'Name', keep_key=key, keep_same=same)
        print left.show()
        print
# Demonstrate the module-level dp.merge across three heterogeneous inputs
# (a SeriesSet, a Frame and a plain list of rows), joining on each table's
# first column (keys=0).
data1 = dp.SeriesSet([['A', 39, 'F'], ['B', 40, 'F'], ['C', 38, 'M']],
                     ['Name', 'Age', 'Gender'])
data2 = dp.Frame([['A', 'F', True], ['B', 'F', False], ['C', 'M', True]],
                 ['Name', 'Gender', 'Married'])

data3 = [['A', 'China'], ['B', 'US'], ['C', 'Japan'], ['D', 'England']]
print dp.merge(
    data1,
    data2,
    data3,
    keys=0,
    keep_key='self',
    keep_same=False,
).show()
Exemple #8
0
# Fit DaPy's LinearRegression (with its 'numpy' engine) on the advertising
# dataset and display the fitted report.
import DaPy as dp
from DaPy.methods import LinearRegression as dp_lr

data = dp.read('advertising.csv')
lr_dp = dp_lr('numpy')
# Features: the column slice from 'TV' through 'newspaper'; target: 'sales'.
lr_dp.fit(data['TV':'newspaper'], data['sales'])
lr_dp.report.show()
Exemple #9
0
def main(files):
    """Run each engine's benchmark 100 times and save averaged results.

    Collects per-run timings for DaPy, Numpy and Pandas into separate
    frames, builds a summary of mean timings per engine, and writes all
    four tables to 'Performance_result.xls'.
    """
    dp_ = dp.Frame(None, ['Load', 'Traverse', 'Sort', 'Save', 'Total'])
    np_ = dp.Frame(None, ['Load', 'Traverse', 'Sort', 'Save', 'Total'])
    pd_ = dp.Frame(None, ['Load', 'Traverse', 'Sort', 'Save', 'Total'])
    for i in range(100):
        dp_.append(test_DaPy(files))
        np_.append(test_Numpy(files))
        pd_.append(test_Pandas(files))

    summary = dp.Frame(
        None,
        ['engine', 'Load', 'Traverse', 'Sort', 'Save', 'Total', 'Version'])
    # One row per engine; the repeated mean-building is factored out below.
    summary.append(_summary_row('DaPy', dp_, dp.__version__))
    summary.append(_summary_row('Numpy', np_, np.__version__))
    summary.append(_summary_row('Pandas', pd_, pd.__version__))

    file_ = dp.DataSet()
    file_.add(summary, 'Summary Table')
    file_.add(dp_, 'DaPy')
    file_.add(np_, 'Numpy')
    file_.add(pd_, 'Pandas')
    file_.save('Performance_result.xls')


def _summary_row(engine, frame, version):
    """Build one summary row: engine name, mean of each timing column, version."""
    means = [dp.mean(frame[col])
             for col in ('Load', 'Traverse', 'Sort', 'Save', 'Total')]
    return [engine] + means + [version]