def get(self, q_dict, step_minute=None):
    """Query data and post-process the raw result.

    q_dict      -- parsed query parameters (_fields, _filters, _pageby, ...).
    step_minute -- time bucket size in minutes (default 5, see rawget).

    Fetches rows via self.rawget(), then depending on the parameters either
    returns the raw data unchanged, filters it (_limit_rows), or fills
    time-series gaps and cleans it. The Elapsed/ResultCount metadata in the
    result dict are updated to account for this post-processing step.
    """
    rst = self.rawget(q_dict, step_minute)
    data = rst['Data']
    if not data:
        return rst
    # Post-processing only applies to grouped (_pageby) queries.
    if not q_dict.get('_pageby', ''):
        return rst
    time1 = time.time()
    # Without _limit_rows, return the raw data as-is when any of these hold:
    # more than one pageby column, more than one field, _tstep=0, or the
    # caller explicitly requested raw data.
    # Fix: `'_rawdata' in q_dict` replaces dict.has_key(), which is removed
    # in Python 3 (identical semantics on Python 2).
    if (not q_dict.get('_limit_rows', '')) and (
            len(q_dict['_pageby']) > 1
            or len(q_dict['_fields']) > 1
            or str(q_dict.get('_tstep', '')) == '0'
            or '_rawdata' in q_dict):
        return rst
    # With _limit_rows, just filter rows by the given rule.
    elif q_dict.get('_limit_rows', ''):
        new_data = self.limit_rows_get(data, q_dict)
        action = 'Filter'
    # Otherwise fill time-series gaps and clean the data.
    else:
        filled = self.datafill(data)
        new_data = self.dataclean(filled, q_dict.get('_dataclean', 5))
        action = 'Fill&Clean'
    rst['Data'] = new_data
    # Fix: round to 3 decimals like rawget does; the original rounded to
    # whole seconds, losing the sub-second cost of this step in the
    # reported Elapsed total.
    elapsed = round(time.time() - time1, 3)
    new_count = len(new_data)
    log('[Data.%s] Elapsed: %s ResultCountChange %s --> %s' % (action, elapsed, rst['ResultCount'], new_count))
    # Fold the post-processing time into the total reported by rawget
    # (rst['Elapsed'] looks like '1.234s' -- strip the trailing 's').
    elapsed += float(rst['Elapsed'][:-1])
    rst['Elapsed'] = str(elapsed) + 's'
    rst['ResultCount'] = str(new_count) + ' rows'
    return rst
def _loadfile2db_one_by_one(self, file, tablename, fields_terminated):
    """Load `file` into `tablename` one record (line) at a time.

    Each line has its trailing newline character chopped and is split on
    `fields_terminated`; the resulting values are handed to
    self.query_append, whose return value ('insert'/'update') is tallied.
    Records that fail to import are logged and counted as errors instead
    of aborting the whole load (deliberate best-effort semantics).

    Returns a summary dict: {'Records', 'Added', 'Updated', 'Errors'}.
    """
    db = self.conn(readonly=False, tablename=tablename)
    cur = db.cursor()
    insert = 0
    update = 0
    err = 0
    rec = 0
    f = open(file, 'rb')
    try:
        # Fix: iterate the file object directly instead of the old
        # `while True: l = f.next()` loop -- identical line-by-line
        # semantics, but f.next() does not exist on Python 3.
        for line in f:
            rec += 1
            # NOTE(review): line[:-1] assumes every line ends with a
            # newline; a final unterminated line would lose its last
            # character -- preserved from the original, confirm intent.
            values = line[:-1].split(fields_terminated)
            try:
                r = self.query_append(cur, tablename, values)
                if r == 'insert':
                    insert += 1
                elif r == 'update':
                    update += 1
            except Exception:
                # Fix: Exception instead of a bare except, so that
                # KeyboardInterrupt/SystemExit still propagate.
                log('append error. %s' % traceback.format_exc(), 1)
                err += 1
    finally:
        # Fix: always release the file/cursor/connection, even if an
        # unexpected error escapes the loop (the original leaked them).
        f.close()
        cur.close()
        db.close()
    return {'Records': rec, 'Added': insert, 'Updated': update, 'Errors': err}
def upload_result_file(request):
    """Accept an uploaded result file via POST and load it into the database.

    POST parameters:
      file=@filename        local file to upload
      _o=cdnbw              data object name
      _s=201203201340       timestamp of this data; the time span covered by
                            the file must not exceed the configured table
                            split interval (table_split)
      _t=,                  field separator, default is a single space
      data_exists_action    how to handle rows that already exist (PK
                            conflict), one of: ignore (default) | replace | append
        ignore:  existing rows are left untouched; new rows import normally
        replace: existing rows are overwritten with the new data
        append:  non-PK fields are accumulated onto existing rows (unsafe
                 for percentage-like fields unless they are truly summable)

    e.g.
    curl -F file=@filename -F _o=cdnbw -F _s=201203201340 -F '_t=,' -F data_exists_action=replace http://jdata.domain/api/uploadresultfile
    """
    uploaddir = '/tmp/jdata/'
    if not isdir(uploaddir):
        mkdir(uploaddir)
    if request.method == 'POST':
        # Fix: `in` replaces dict.has_key (removed in Python 3, same semantics).
        if 'HTTP_X_FORWARDED_FOR' in request.META:
            client = request.META['HTTP_X_FORWARDED_FOR']
        else:
            client = request.META['REMOTE_ADDR']
        file = request.FILES['file']
        filename = file.name
        try:
            obj = request.POST['_o']
        except KeyError:
            # Fix: catch KeyError specifically, not a bare except.
            raise UploadDataParameterError("parameter `_o` is required")
        DM = DataModel(obj)
        try:
            timeline = request.POST['_s']
        except KeyError:
            raise UploadDataParameterError("parameter `_s` is required")
        fields_terminated = request.POST.get('_t', ' ')
        objdir = join(uploaddir, obj)
        if not isdir(objdir):
            mkdir(objdir)
        # Spool the upload to a uniquely named temp file before loading.
        now = time.strftime('%Y%m%d%H%M%S', time.localtime())
        tmpfile = join(objdir, now + '_' + timeline + '_' + filename)
        f = open(tmpfile, 'wb')
        try:
            f.write(file.read())
        finally:
            # Fix: close the spool file even if the write fails.
            f.close()
        data_exists_action = request.POST.get('data_exists_action', 'ignore')
        t1 = time.time()
        if data_exists_action in ('ignore', 'replace', 'append'):
            tablename = DM.Data.gettablename(timeline)
            if data_exists_action in ('replace', 'append'):
                # replace/append need a primary key to detect conflicts.
                if not DM.Data.pk:
                    raise ObjectPKNotFound('No PK Found For this Data `%s` ,so you can not use parameter `%s`' % (obj, data_exists_action))
            rst = DM.Data.loadfile2db(tmpfile, tablename, fields_terminated, data_exists_action)
            log('upload: %s %s [%s] %ss %s %s' % (client, obj, data_exists_action, int(time.time() - t1), tmpfile, rst))
            try:
                remove(tmpfile)
            except OSError:
                # Best-effort cleanup; the load itself already succeeded.
                log('remove resultfile Failed: %s' % tmpfile)
            return return_http_json(rst)
        else:
            raise UploadDataParameterError(" Unknown value `" + data_exists_action + "` for data_exists_action")
    else:
        raise UploadDataMethodError("""Please POST your data like this:
curl -F file=@filename -F _o=cdnbw -F _s=201203201340 -F '_t= ' http://jdata.domain/api/uploadresultfile""")
def query_master(self, sql):
    """Run `sql` against the master DMC database and return all rows.

    SELECT statements go to the read connection (dmc_master_r); everything
    else goes to the write connection (dmc_master_w). A 'commit' is issued
    after each statement, and the cursor/connection are now always closed,
    even when execution raises.
    """
    # startswith() is the idiomatic equivalent of .find(...) == 0.
    if sql.lower().startswith('select'):
        c = self._conn(self.dmc_master_r)
        log('MR: %s' % sql, 2)
    else:
        c = self._conn(self.dmc_master_w)
        log('MW: %s' % sql, 2)
    try:
        cur = c.cursor()
        try:
            cur.execute(sql)
            t = cur.fetchall()
            cur.execute('commit')
        finally:
            # Fix: the original leaked the cursor and connection whenever
            # execute()/fetchall() raised.
            cur.close()
    finally:
        c.close()
    return t
def queryexecute(self,sql, tablename = ''):
    # Execute `sql` on a read-only connection for SELECTs, otherwise on a
    # read-write connection, translating well-known MySQL error codes into
    # project exceptions and retrying once on connection exhaustion.
    # NOTE(review): this chunk appears truncated -- no success path or
    # return statement is visible after the except clauses; confirm against
    # the full source before relying on the return value.
    if sql.lower().find('select') == 0:
        db = self.conn(readonly = True, tablename = tablename)
        log('R: %s' %sql, 1)
    else:
        db = self.conn(readonly = False, tablename = tablename )
        log('W: %s' %sql, 1)
    c=db.cursor()
    rows_examined = 0
    try:
        try:
            c.execute(sql)
        except OperationalError , e:
            if e[0] == 1203:
                # MySQL error 1203: max_user_connections exceeded --
                # back off briefly and retry the statement once.
                log('1203 Error max_user_connections, retry...', 1)
                time.sleep(0.01)
                c.execute(sql)
            elif e[0] == 1050:
                # MySQL error 1050: table already exists.
                raise TableAlreadyExists(e[1])
            else:
                raise e
    except ProgrammingError , e:
        #if sql.lower().find('select') == 0 and e[0] == 1146:
        if e[0] == 1146:
            # MySQL error 1146: table doesn't exist.
            raise TableNotExists('Table not exists: '+e[1])
        else:
            raise e
def query(self,s,iscache=True,refresh=False,tablename = ''):
    # Execute SQL `s` with an optional result cache.
    #   iscache: when False, bypass the cache entirely and hit the database.
    #   refresh: when True, skip the cache lookup and re-run the query.
    # NOTE(review): this chunk appears truncated -- a longer tail (the
    # TableNotExistsInDMC handler, the cache store and the final return)
    # exists elsewhere in the file; do not assume this is the whole body.
    if not iscache:
        return self.queryexecute(s, tablename)
    # Cache key: 'jd_' + md5 hexdigest of the raw SQL text.
    m=md5.new()
    m.update(s)
    ky=m.hexdigest()
    ky = 'jd_'+ky
    if not refresh:
        v = sqlcache.get(ky)
        if v:
            log('CACHE HIT:%s ProcessedRowsMem:%s' %(tablename, v['ProcessedRowsMem']))
            return v
        else:
            info = 'CACHE MISS:'
    else:
        info = 'CACHE REFRESH:'
    try:
        rst = self.queryexecute(s, tablename)
    except TableNotExists,e:
        # A missing table is treated as an empty result set, not an error.
        log('%s %s' %(info, e), 1)
        return {'Data':(),'ProcessedRowsMem':0,'ProcessedRowsDisk':0}
def upload_result_file(request):
    """Accept an uploaded result file via POST and load it into the database.

    POST parameters:
      file=@filename        local file to upload
      _o=cdnbw              data object name
      _s=201203201340       timestamp of this data; the time span covered by
                            the file must not exceed the configured table
                            split interval (table_split)
      _t=,                  field separator, default is a single space
      data_exists_action    how to handle rows that already exist (PK
                            conflict), one of: ignore (default) | replace | append
        ignore:  existing rows are left untouched; new rows import normally
        replace: existing rows are overwritten with the new data
        append:  non-PK fields are accumulated onto existing rows (unsafe
                 for percentage-like fields unless they are truly summable)

    e.g.
    curl -F file=@filename -F _o=cdnbw -F _s=201203201340 -F '_t=,' -F data_exists_action=replace http://jdata.domain/api/uploadresultfile
    """
    uploaddir = '/tmp/jdata/'
    if not isdir(uploaddir):
        mkdir(uploaddir)
    if request.method == 'POST':
        # Fix: `in` replaces dict.has_key (removed in Python 3, same semantics).
        if 'HTTP_X_FORWARDED_FOR' in request.META:
            client = request.META['HTTP_X_FORWARDED_FOR']
        else:
            client = request.META['REMOTE_ADDR']
        file = request.FILES['file']
        filename = file.name
        try:
            obj = request.POST['_o']
        except KeyError:
            # Fix: catch KeyError specifically, not a bare except.
            raise UploadDataParameterError("parameter `_o` is required")
        DM = DataModel(obj)
        try:
            timeline = request.POST['_s']
        except KeyError:
            raise UploadDataParameterError("parameter `_s` is required")
        fields_terminated = request.POST.get('_t', ' ')
        objdir = join(uploaddir, obj)
        if not isdir(objdir):
            mkdir(objdir)
        # Spool the upload to a uniquely named temp file before loading.
        now = time.strftime('%Y%m%d%H%M%S', time.localtime())
        tmpfile = join(objdir, now + '_' + timeline + '_' + filename)
        f = open(tmpfile, 'wb')
        try:
            f.write(file.read())
        finally:
            # Fix: close the spool file even if the write fails.
            f.close()
        data_exists_action = request.POST.get('data_exists_action', 'ignore')
        t1 = time.time()
        if data_exists_action in ('ignore', 'replace', 'append'):
            tablename = DM.Data.gettablename(timeline)
            if data_exists_action in ('replace', 'append'):
                # replace/append need a primary key to detect conflicts.
                if not DM.Data.pk:
                    raise ObjectPKNotFound(
                        'No PK Found For this Data `%s` ,so you can not use parameter `%s`'
                        % (obj, data_exists_action))
            rst = DM.Data.loadfile2db(tmpfile, tablename, fields_terminated, data_exists_action)
            log('upload: %s %s [%s] %ss %s %s'
                % (client, obj, data_exists_action, int(time.time() - t1), tmpfile, rst))
            try:
                remove(tmpfile)
            except OSError:
                # Best-effort cleanup; the load itself already succeeded.
                log('remove resultfile Failed: %s' % tmpfile)
            return return_http_json(rst)
        else:
            raise UploadDataParameterError(
                " Unknown value `" + data_exists_action + "` for data_exists_action")
    else:
        raise UploadDataMethodError(
            """Please POST your data like this:
curl -F file=@filename -F _o=cdnbw -F _s=201203201340 -F '_t= ' http://jdata.domain/api/uploadresultfile"""
        )
def rawget(self,q_dict,step_minute=None): #query_dict #step(time): 5 (unit:minute)
    # Build per-table SQL from the query dict, run the queries in parallel
    # threads (one per split table), and return the merged, sorted rows plus
    # query metadata. step_minute is the time-bucket size in minutes;
    # '_tstep' in q_dict overrides it, default 5; '0' disables time grouping.
    # NOTE(review): results and row counters are accumulated on self
    # (multithreadresult, rowsfromdisk, ...) and reset at the end, so this
    # method is not safe for concurrent calls on the same instance.
    if not step_minute:
        if not q_dict.has_key('_tstep'):
            step_minute=5
        else:
            step_minute=q_dict['_tstep']
    step_sec = 60*int(step_minute)
    time1=time.time()
    # SQL expression that floors the record timestamp down to the start of
    # its step_sec bucket and formats it for display.
    time_field = 'from_unixtime(floor((unix_timestamp(concat(%s,"00"))+28800)/%s)*%s-28800, "%s")' %(self.timefield, step_sec, step_sec, "%Y/%m/%d %H:%i")
    # The +/-28800 (8h) offset compensates for local unix time not starting
    # at midnight (epoch is 1970-01-01 08:00:00 in this UTC+8 timezone), so
    # buckets align to local-day boundaries.
    querylist=[]
    display_fields = []
    sql_s = 'select '
    # --- SELECT clause ---
    if str(step_minute) != '0':
        sql_s=sql_s+time_field+', '
        display_fields.append('时间')
    if q_dict.get('_pageby',''):
        # Group-by columns: map through fields_alias when defined, otherwise
        # look the display name up in meta_db (fall back to the raw name).
        for j in q_dict['_pageby']:
            if j in self.fields_alias.keys():
                display_fields.append(self.fields_alias[j][1])
                j = self.fields_alias[j][0]
            else:
                try:
                    fd_name = [o[2] for o in self.meta_db if o[0] == j][0]
                except:
                    fd_name = j
                display_fields.append(fd_name)
            sql_s = sql_s + j + ','
    for i in q_dict['_fields']:
        if i in self.fields_alias.keys():
            display_fields.append(self.fields_alias[i][1])
            # Alias expressions may embed the step size via the
            # '_minutes'/'_tstep' placeholders.
            i = self.fields_alias[i][0].replace('_minutes',str(step_minute))
            i = i.replace('_tstep', str(step_minute))
        else:
            try:
                fd_name = [o[2] for o in self.meta_db if o[0] == i][0]
            except:
                fd_name = i
            display_fields.append(fd_name)
        # check tstep: with _tstep=0 only additive aggregates (sum, count)
        # are allowed; -, *, /, avg, max, min are not supported yet.
        if str(step_minute) == '0':
            for x in ('/', '*', '-', 'avg', 'max', 'min'):
                if i.find(x) >= 0:
                    raise UnsupportedQuery('Field `%s` is not supported with `_tstep=0`' %i)
        sql_s=sql_s+i+','
    sql_s = sql_s[:-1]+' from '
    sql_w = ' where '
    # --- WHERE clause / time range ---
    # Default range: today 00:00 up to now; '_s'/'_e' override start/end.
    if not q_dict.has_key('_s'):
        stime=time.strftime('%Y%m%d0000')
        etime=time.strftime('%Y%m%d%H%M')
    else:
        stime=q_dict['_s']
        if not q_dict.has_key('_e'):
            etime=time.strftime('%Y%m%d%H%M')
        else:
            etime=q_dict['_e']
    # One (tablename, start, end) triple per split table in the range.
    try:
        querylist=self.generate_querylist(stime,etime)
    except ValueError:
        raise URLParameterError('Time Format Error `start`:'+stime+' `end`:'+etime)
    for i in q_dict['_filters']:
        sql_w = sql_w+i+' and '
    # --- GROUP BY clause ---
    if str(step_minute) != '0':
        if q_dict.get('_pageby',''):
            sql_g = ' group by '+time_field
            for j in q_dict['_pageby']:
                if j in self.fields_alias.keys():
                    j = self.fields_alias[j][0]
                sql_g = sql_g + ',' + j
        else:
            sql_g = ' group by '+time_field
    else:
        if q_dict.get('_pageby',''):
            sql_g = ' group by '
            for j in q_dict['_pageby']:
                if j in self.fields_alias.keys():
                    j = self.fields_alias[j][0]
                sql_g = sql_g + j + ','
            sql_g = sql_g[:-1]
        else:
            sql_g = ''
    # --- Fan out: one thread per split table ---
    mythread = []
    debugout = []
    for i in querylist:
        sql=sql_s+i[0]+sql_w+self.timefield+' >= '+i[1]+' and '+self.timefield+' <= '+i[2]+sql_g
        args=(sql,step_minute,q_dict.get('_pageby',''),q_dict.has_key('_refresh'), i[0])
        t=Thread(target=self.multithreadquery,args=args)
        mythread.append(t)
        debugout.append( dict(zip(('sql','_tstep','_pageby','_refresh','tablename'), args) ))
    # '_debug' aborts before execution and surfaces the generated SQL.
    if q_dict.has_key('_debug'):
        raise DebugPrint(debugout)
    for i in mythread:
        i.start()
    for i in mythread:
        i.join()
    # Workers appended into self.multithreadresult; merge and sort rows.
    data= list(self.multithreadresult)
    data.sort()
    rst_cnt = len(data)
    querys = len(querylist)
    elapsed = round(time.time()-time1,3)
    log('[Data.rawget] Elapsed:%s QuerysALL:%s FromMem/Disk/NotFound:%s/%s/%s \
ProcessedRowsDisk:%s ProcessedRowsMem:%s ResultCount:%s %s' %(elapsed, querys, self.querys_frommem, self.querys_fromdisk, self.querys_notfound, self.rowsfromdisk, self.rowsfrommem, rst_cnt, q_dict))
    rst = {'Data':data, 'Metadata':display_fields, 'Elapsed':str(elapsed)+'s', 'ProcessedRowsDisk':str(self.rowsfromdisk)+' rows', 'ProcessedRowsMem':str(self.rowsfrommem)+' rows', 'ResultCount':str(rst_cnt)+' rows'}
    # Reset the shared accumulators for the next call.
    self.multithreadresult=()
    self.rowsfromdisk = 0
    self.rowsfrommem = 0
    self.querys_notfound = 0
    self.querys_frommem = 0
    self.querys_fromdisk = 0
    return rst
if not refresh: v = sqlcache.get(ky) if v: log('CACHE HIT:%s ProcessedRowsMem:%s' %(tablename, v['ProcessedRowsMem'])) return v else: info = 'CACHE MISS:' else: info = 'CACHE REFRESH:' try: rst = self.queryexecute(s, tablename) except TableNotExists,e: log('%s %s' %(info, e), 1) return {'Data':(),'ProcessedRowsMem':0,'ProcessedRowsDisk':0} except TableNotExistsInDMC,e: log('%s %s' %(info, e), 1) return {'Data':(),'ProcessedRowsMem':0,'ProcessedRowsDisk':0} log('%s:%s ProcessedRowsDisk:%s' %(info, tablename, rst['ProcessedRowsDisk'])) mem_rst = {} mem_rst['Data'] = rst['Data'] mem_rst['ProcessedRowsMem'] = rst['ProcessedRowsDisk'] mem_rst['ProcessedRowsDisk'] = 0 sqlcache.set(ky,mem_rst,60*60*24) return rst def multithreadquery(self,sql,step_minute,pageby,refresh,tablename): rst=self.query(sql,refresh=refresh, tablename = tablename) t = rst['Data'] rows_disk = rst['ProcessedRowsDisk'] rows_mem = rst['ProcessedRowsMem']