from time import clock

# shared imports for the benchmark functions in this file
import DaPy as dp
import numpy as np
import pandas as pd


def test_DaPy(files):
    # Testing of DaPy
    t1 = clock()
    data_dapy = dp.read(files)
    t2 = clock()
    data_dapy.toframe()
    data_dapy = data_dapy.data
    t2_ = clock()
    for line in data_dapy:
        this_line = line
    t3 = clock()
    data_dapy.sort(('Price', 'ASC'))
    t4 = clock()
    dp.save('test_DaPy.csv', data_dapy)
    t5 = clock()
    return t2 - t1, t3 - t2_, t4 - t3, t5 - t4, t5 - t1
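# A hedged sketch of the Pandas counterpart: main() below appends results
# from test_Pandas() and test_Numpy(), which are not shown in this section.
# The 'Price' column and the (load, traverse, sort, save, total) return
# layout mirror test_DaPy above; this is an assumed reconstruction, not the
# original implementation.
def test_Pandas(files):
    t1 = clock()
    data_pandas = pd.read_csv(files)
    t2 = clock()
    for index in data_pandas.index:
        this_line = data_pandas.loc[index].values
    t3 = clock()
    data_pandas = data_pandas.sort_values('Price')
    t4 = clock()
    data_pandas.to_csv('test_Pandas.csv', index=False)
    t5 = clock()
    return t2 - t1, t3 - t2, t4 - t3, t5 - t4, t5 - t1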
def _forward(self, S, E, I, D, C, param, max_iter):
    a1, a2, b, s, g = param
    est = dp.Table(columns=['S', 'E', 'I', 'D', 'C'])
    for t in range(max_iter):
        S_ = S - a1 * E - a2 * I + s * I
        E_ = E + a1 * E + a2 * I - b * E
        I_ = I + b * E - s * I - g * I
        D_ = D + g * I
        C_ = C + s * I
        S, E, I, D, C = S_, E_, I_, D_, C_
        est.append_row([S, E, I, D, C])
    return est
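# For reference, the loop above implements the discrete-time SEIDC system
# (the parameter roles are our reading of the code, not documented in the
# source: a1/a2 exposure rates, b incubation-to-infectious rate, s recovery
# rate feeding both the susceptible pool and the cumulative count C, g death
# rate accumulating in D):
#   S[t+1] = S[t] - a1*E[t] - a2*I[t] + s*I[t]
#   E[t+1] = E[t] + a1*E[t] + a2*I[t] - b*E[t]
#   I[t+1] = I[t] + b*E[t] - s*I[t] - g*I[t]
#   D[t+1] = D[t] + g*I[t]
#   C[t+1] = C[t] + s*I[t]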
import gc
import sys
import time

import DaPy as dp
import numpy as np
import pandas as pd


def test_load(files):
    gc.disable()

    # Testing of Pandas
    t1 = time.time()
    data_pandas = pd.DataFrame(pd.read_csv(files))
    t2 = time.time()
    for index in data_pandas.index:
        this_line = data_pandas.loc[index].values[:]
    t3 = time.time()

    # Testing of DaPy
    data_DaPy = dp.DataSet(files)
    data_DaPy.readframe()
    t4 = time.time()
    for item in data_DaPy:
        this_line = item
    t5 = time.time()

    # Testing of Numpy
    data_numpy = np.genfromtxt(files, dtype=None, delimiter=',', names=True)
    t6 = time.time()
    for line in data_numpy:
        this_line = line
    t7 = time.time()

    # File information
    with open(files, 'r') as f:
        data_file = f.read()

    # Calculate the memory footprint of each data set
    size_pandas = sys.getsizeof(data_pandas)
    size_DaPy = sys.getsizeof(data_DaPy.data)
    size_numpy = sys.getsizeof(data_numpy)
    size_file = sys.getsizeof(data_file)
    gc.enable()
    gc.collect()

    print '\n'
    print '               DaPy | Pandas | Numpy | File'
    print '    Load Time:{0:^4.2f}\t| {1:^4.2f}\t| {2:^4.2f}\t| -'.format(
        t4 - t3, t2 - t1, t6 - t5)
    print 'Traverse Time:{0:^4.1f}\t| {1:^4.1f}\t| {2:^4.1f}\t| -'.format(
        t5 - t4, t3 - t2, t7 - t6)
    print '  Total Spent:{0:^4.1f}\t| {1:^4.1f}\t| {2:^4.1f}\t| -'.format(
        t5 - t3, t3 - t1, t7 - t5)
    print '  Memory Size: %dMB\t| %dMB\t| %dMB\t| %dMB' % (
        size_DaPy // 1048576, size_pandas // 1048576,    # 1 MB = 1048576 B
        size_numpy // 1048576, size_file // 1048576)
    print '\n'
def __init__(self, addr):
    threading.Thread.__init__(self)
    self._data = dp.read(addr)
    self.start()
#!/usr/bin/env python
# coding: utf-8

# In[43]:

# Data preprocessing
import numpy as np
import DaPy as dp
from random import randint, random

# In[45]:

data = dp.read('Updates_NC.csv').data

def decode(val, code='utf-8'):
    # decode bytes cells into unicode strings; pass other values through
    if isinstance(val, bytes):
        return val.decode(code)
    return val

data = data.map(decode, cols=['报道时间', '省份', '城市', '消息来源'],
                inplace=True)

# In[ ]:

# Going back from the 23rd, only about six days of usable data (down to
# around the 10th) could be found; we tried that but the fit was poor, so we
# decided to include data from after the lockdown, taking the 16th as the
# initial date. On February 12 the diagnostic criteria changed and the counts
# surged, so the model is trained on data from January 17 to February 11.
# Control measures of varying intensity were in force during this window;
# here we simply assume they were constant.
wuhan = data.query(' 城市 == "武汉市" and 报道时间 > "1月16日" ').reverse()
wuhan = wuhan.groupby('报道时间', max,
                      apply_col=['新增确诊', '新增出院', '新增死亡'])
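# A minimal sketch (an assumed helper, not in the original): the grouped
# columns above are daily increments (新增确诊 = new confirmed, 新增出院 =
# newly discharged, 新增死亡 = new deaths), while the SEIDC forward model
# tracks cumulative levels, so the increments presumably need to be
# accumulated before fitting.
def cumulate(daily):
    total, levels = 0, []
    for value in daily:
        total += value
        levels.append(total)
    return levels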
import numpy as np
import DaPy as dp
from timeit import Timer

X = [[17, 2, 9, 2],
     [21, 8, 1, 46],
     [4, 3, 2, 13],
     [23, 1, 31, 3]]
X_dp = dp.mat(X)
X_np = np.mat(X)

def numpy_multi():
    X_np * X_np
    X_np * 32

def dapy_multi():
    X_dp * X_dp
    X_dp * 32

def numpy_dot():
    X_np.T.dot(X_np)

def dapy_dot():
    X_dp.T.dot(X_dp)

def numpy_attribute():
    X_np.T
    X_np.I
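# A hedged usage sketch: the functions above are plain callables, so they can
# be timed with the Timer imported at the top; the repeat/number counts here
# are arbitrary choices, not taken from the original benchmark.
for name in ('numpy_multi', 'dapy_multi', 'numpy_dot', 'dapy_dot',
             'numpy_attribute'):
    timer = Timer('%s()' % name, 'from __main__ import %s' % name)
    print '%15s: %.6fs' % (name, min(timer.repeat(repeat=3, number=1000)))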
import DaPy as dp

for key in (True, False, 'other', 'self'):
    for same in (True, False):
        left = dp.SeriesSet(
            [['Alan', 35], ['Bob', 27], ['Charlie', 30], ['Daniel', 29]],
            ['Name', 'Age'])
        right = dp.SeriesSet(
            [['Alan', 'M', 35], ['Bob', 'M', 27],
             ['Charlie', 'F', 30], ['Janny', 'F', 26]],
            ['Name', 'gender', 'Age'])
        print 'MERGE with keep_key=%s and keep_same=%s' % (key, same)
        left.merge(right, 'Name', 'Name', keep_key=key, keep_same=same)
        print left.show()
        print

data1 = dp.SeriesSet([['A', 39, 'F'], ['B', 40, 'F'], ['C', 38, 'M']],
                     ['Name', 'Age', 'Gender'])
data2 = dp.Frame([['A', 'F', True], ['B', 'F', False], ['C', 'M', True]],
                 ['Name', 'Gender', 'Married'])
data3 = [['A', 'China'], ['B', 'US'], ['C', 'Japan'], ['D', 'England']]
print dp.merge(
    data1, data2, data3,
    keys=0, keep_key='self', keep_same=False,
).show()
import DaPy as dp
from DaPy.methods import LinearRegression as dp_lr

data = dp.read('advertising.csv')
lr_dp = dp_lr('numpy')
lr_dp.fit(data['TV':'newspaper'], data['sales'])
lr_dp.report.show()
def main(files):
    dp_ = dp.Frame(None, ['Load', 'Traverse', 'Sort', 'Save', 'Total'])
    np_ = dp.Frame(None, ['Load', 'Traverse', 'Sort', 'Save', 'Total'])
    pd_ = dp.Frame(None, ['Load', 'Traverse', 'Sort', 'Save', 'Total'])
    for i in range(100):
        dp_.append(test_DaPy(files))
        np_.append(test_Numpy(files))
        pd_.append(test_Pandas(files))

    summary = dp.Frame(
        None,
        ['engine', 'Load', 'Traverse', 'Sort', 'Save', 'Total', 'Version'])
    summary.append([
        'DaPy',
        dp.mean(dp_['Load']), dp.mean(dp_['Traverse']), dp.mean(dp_['Sort']),
        dp.mean(dp_['Save']), dp.mean(dp_['Total']), dp.__version__])
    summary.append([
        'Numpy',
        dp.mean(np_['Load']), dp.mean(np_['Traverse']), dp.mean(np_['Sort']),
        dp.mean(np_['Save']), dp.mean(np_['Total']), np.__version__])
    summary.append([
        'Pandas',
        dp.mean(pd_['Load']), dp.mean(pd_['Traverse']), dp.mean(pd_['Sort']),
        dp.mean(pd_['Save']), dp.mean(pd_['Total']), pd.__version__])

    file_ = dp.DataSet()
    file_.add(summary, 'Summary Table')
    file_.add(dp_, 'DaPy')
    file_.add(np_, 'Numpy')
    file_.add(pd_, 'Pandas')
    file_.save('Performance_result.xls')
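# Hypothetical entry point (the file name is a placeholder, not from the
# source): run the 100-round benchmark and write the XLS report.
if __name__ == '__main__':
    main('sample.csv')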