def MaxTE(self, Cursor, Date, Limit=100):
    '''Compute the per-row price maximum; requires a date and a limit.'''
    Qry = """
        select id_ofertas_energia_david, hora, central,
               precio_per_mw_1, precio_per_mw_2, precio_per_mw_3,
               precio_per_mw_4, precio_per_mw_5, precio_per_mw_6,
               precio_per_mw_7, precio_per_mw_8, precio_per_mw_9,
               precio_per_mw_10, precio_per_mw_11
        from ofertas_energia_david
        where tipo_reporte='TE' and fecha_inicial='{0}'
        limit {1}
    """.format(Date, Limit)
    Cursor.execute(Qry)
    # Format the result set as a pandas DataFrame
    df = as_pandas(Cursor)
    # Convert to an R data frame and hand it to R
    df = com.convert_to_r_dataframe(df)
    ro.r('source("./Rfunctions/max.R")')
    ro.globalenv['tabla'] = df
    ro.r('Out <- Rmax(tabla)')
    print(ro.r('Out'))

def test_zBizRule2(self):
    cs = self.icon.cursor()
    try:
        cs.execute(
            """ select max(order_date) as maxd from db.fct_total_order_trans """
        )
        df = as_pandas(cs)
        maxd = str(df["maxd"][0])
        lastyeard = TableTestCase.TableTestCase.GetLastYearSameDay(maxd)
        sql = """select a.store_key, a.t_amt/b.t_amt from
                 (select sum(real_amt) t_amt, store_key from db.fct_total_order_trans
                  where order_date='%s' and data_source='store' group by store_key) a
                 join
                 (select sum(real_amt) t_amt, store_key from db.fct_total_order_trans
                  where order_date='%s' group by store_key) b
                 on a.store_key=b.store_key
                 where a.t_amt/b.t_amt is not null
                 and (a.t_amt/b.t_amt > 3 or a.t_amt/b.t_amt < 0.3)""" % (maxd, lastyeard)
        cs.execute(sql)
        df = as_pandas(cs)
        rs = len(df)
        # Message: "Omnichannel transaction table: comparison against the same
        # day last year found rows with excessively large differences."
        if rs > 20:
            self.raiseFailure("全渠道交易表:去年同日的销售量对比,发现差异过大的数据。<br />%s" % df.to_html())
        if rs in range(1, 21):
            self.raiseError("全渠道交易表:去年同日的销售量对比,发现差异过大的数据。<br />%s" % df.to_html())
    finally:
        cs.close()

def api_payrate_zhexian():
    # Each SQL template contains placeholder tokens that are substituted
    # with the request parameters before execution.
    now_time, seven_days_time = get_now_and_7days_time()
    if 'provinceid' in request.args and 'cityid' in request.args and "areaid" in request.args and "shopid" not in request.args:
        sql = get_sql("sql", "sql_payrate_zhexian1") \
            .replace("get_provinced_id", request.args['provinceid']) \
            .replace("get_city_id", request.args['cityid']) \
            .replace("get_area_id", request.args['areaid']) \
            .replace("seven_days_time", "'" + seven_days_time + "'") \
            .replace("now_time", "'" + now_time + "'")
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array(df)
        return res
    if 'provinceid' in request.args and 'cityid' in request.args and "shopid" not in request.args:
        sql = get_sql("sql", "sql_payrate_zhexian2") \
            .replace("get_provinced_id", request.args['provinceid']) \
            .replace("get_city_id", request.args['cityid']) \
            .replace("seven_days_time", "'" + seven_days_time + "'") \
            .replace("now_time", "'" + now_time + "'")
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array(df)
        return res
    if 'provinceid' in request.args and "shopid" not in request.args:
        sql = get_sql("sql", "sql_payrate_zhexian3") \
            .replace("get_provinced_id", request.args['provinceid']) \
            .replace("seven_days_time", "'" + seven_days_time + "'") \
            .replace("now_time", "'" + now_time + "'")
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array(df)
        return res
    if 'shopid' in request.args:
        sql = get_sql("sql", "sql_payrate_zhexian4") \
            .replace("shop__", request.args['shopid']) \
            .replace("seven_days_time", "'" + seven_days_time + "'") \
            .replace("now_time", "'" + now_time + "'")
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array(df)
        return res
    else:
        sql = get_sql("sql", "sql_payrate_zhexian5") \
            .replace("seven_days_time", "'" + seven_days_time + "'") \
            .replace("now_time", "'" + now_time + "'")
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array(df)
        return res

def api_payrate_bing():
    # Same placeholder-substitution pattern as api_payrate_zhexian above,
    # keyed on yesterday's date.
    yes_time = get_yes_time()
    if 'provinceid' in request.args and 'cityid' in request.args and "areaid" in request.args and "shopid" not in request.args:
        sql = get_sql("sql", "sql_payrate1") \
            .replace("province_", request.args['provinceid']) \
            .replace("city_", request.args['cityid']) \
            .replace("area_", request.args['areaid']) \
            .replace("yes_time", "'" + yes_time + "'")
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array3(df)
        return res
    if 'provinceid' in request.args and 'cityid' in request.args and "shopid" not in request.args:
        sql = get_sql("sql", "sql_payrate2") \
            .replace("province_", request.args['provinceid']) \
            .replace("city_", request.args['cityid']) \
            .replace("yes_time", "'" + yes_time + "'")
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array3(df)
        return res
    if 'provinceid' in request.args and "shopid" not in request.args:
        sql = get_sql("sql", "sql_payrate3") \
            .replace("province_", request.args['provinceid']) \
            .replace("yes_time", "'" + yes_time + "'")
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array3(df)
        return res
    if 'shopid' in request.args:
        sql = get_sql("sql", "sql_payrate4") \
            .replace("shop__", request.args['shopid']) \
            .replace("yes_time", "'" + yes_time + "'")
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array3(df)
        return res
    else:
        sql = get_sql("sql", "sql_payrate5") \
            .replace("yes_time", "'" + yes_time + "'")
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array3(df)
        return res

def main():
    # Loading data from MySQL as an example
    conn = MySQLConnection()
    # In the query below, table_name has to be replaced with some existing table
    query = '''
        select * from table_name limit 10
    '''
    conn.execute(query)
    res = as_pandas(conn.cur)
    print(res.head())
    res.loc[res.shape[0]] = [None, 'TEST']
    print('Table is loaded')
    # Loading the table into MySQL (another schema)
    # In the query below, schema_name has to be replaced with some existing schema
    conn.loadTableIntoMySQL(res, table='schema_name.test_table', replace=True, print_q=False)
    # Deleting the created table
    conn.execute('drop table if exists schema_name.test_table')
    print('Done')

def ExecuteHiveSQL(sql, user, password, host, port):
    try:
        impala_logger = logging.getLogger('impala')
        # Enable only the CRITICAL logger
        impala_logger.setLevel(logging.CRITICAL)
        ConnectionErrors = (Error, DatabaseError, InternalError,
                            OperationalError, ProgrammingError,
                            IntegrityError, DataError, NotSupportedError)
        # Establish the Hive connection
        conn = getHiveConnection(user, password, host, int(port))
        cursor = conn.cursor()
        cursor.execute(sql)
        df_output = as_pandas(cursor)
        conn.close()
        return df_output
    except ConnectionErrors as e:
        logging.error("ExecuteHiveSQL(): Database Error - " + str(e))
        print(e)
        raise
    except Exception as e:
        logging.error("ExecuteHiveSQL(): Failed - " + str(e))
        print(e)

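# A minimal usage sketch for ExecuteHiveSQL above. The host, port, and
# credentials are placeholders, and getHiveConnection is assumed to wrap
# impala.dbapi.connect for HiveServer2; adjust to your environment. Note
# that the function returns None when a non-database exception is caught,
# hence the guard before using the result.
df = ExecuteHiveSQL("select * from some_db.some_table limit 10",
                    user="hive_user", password="secret",
                    host="hive.example.com", port=10000)
if df is not None:
    print(df.shape)
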
def impala_query(sql):
    impala_HMS_HOST = os.getenv('IMPALA_HOST', 'url')
    impala = connect(host=impala_HMS_HOST, port=21050, use_ssl=False,
                     auth_mechanism='GSSAPI', kerberos_service_name='impala')
    proc_start = time.time()  # Time check
    impala_cursor = impala.cursor()
    impala_cursor.execute(sql)
    df = as_pandas(impala_cursor)
    proc_end = time.time()
    columns = df.shape[1]
    nrows = len(df)
    # Fetch statistics (computed but not returned)
    memory_usage = round(sum(df.memory_usage(deep=True)) / 1024**2, 6)
    read_time = round(proc_end - proc_start, 6)
    impala.close()
    return df

def run_sql(sql, username, password):
    conn = connect(host='10.2.8.96', auth_mechanism='PLAIN', port=21050,
                   user=username, password=password)
    cursor = conn.cursor()
    cnt = 1
    if ';' in sql:
        # Split a multi-statement script on semicolons and run each piece
        sql_list = sql.rstrip().split(';')
        if len(sql_list[-1]):
            for s in sql_list:
                print("running sql @ %s" % cnt)
                cursor.execute(s)
                cnt += 1
        else:
            # A trailing semicolon leaves an empty last element; drop it
            sql_list.pop()
            for s in sql_list:
                print("running sql @ %s" % cnt)
                cursor.execute(s)
                cnt += 1
    else:
        print("running sql @ %s" % cnt)
        cursor.execute(sql)
    return as_pandas(cursor) if cursor.description is not None else 'null'

def main():
    conn = OracleConnection()
    # In the query below, table_name has to be replaced with some existing table
    query = '''
        select * from table_name where rownum <= 5
    '''
    conn.execute(query)
    res = as_pandas(conn.cur)
    print(res.head(3))
    print('Table is here')
    res.loc[res.shape[0]] = [None, 1223, 2232, 3232, 'TEST']
    # In the query below, schema_name has to be replaced with some existing schema
    conn.loadTableIntoOracle(res, table='schema_name.test_table', replace=True, print_q=False)
    # Deleting the created table
    conn.execute('drop table schema_name.test_table')
    conn.execute('commit')
    print('Done')

def run_query(self, query):
    with dbclient(pyodbc_connect(self.dc, self.database)) as cursor:
        res = cursor.execute(query)
        getlogger().debug(
            '\n>>>>>>>>>>>>>>>>type of cursor.execute is {}'.format(type(res)))
        return as_pandas(res)

def hive_query(sql):
    HIVE_HMS_HOST = os.getenv('HIVE_HS2_HOST', 'url')
    hive = connect(host=HIVE_HMS_HOST, port=10000, use_ssl=False,
                   auth_mechanism='GSSAPI', kerberos_service_name='hive')
    proc_start = time.time()  # Time check
    hive_cursor = hive.cursor()
    hive_cursor.execute(sql)
    df = as_pandas(hive_cursor)
    proc_end = time.time()
    columns = df.shape[1]
    nrows = len(df)
    # Fetch statistics (computed but not returned)
    memory_usage = round(sum(df.memory_usage(deep=True)) / 1024**2, 6)
    read_time = round(proc_end - proc_start, 6)
    hive.close()
    return df

def taonewslabel():
    pipe = rds.pipeline()
    cur.execute(
        "select userid, GROUP_CONCAT(cast(type as string)) as tags "
        "from reports.taonew_label_user where logday='%s' group by userid;" % yestoday)
    df = as_pandas(cur)
    dict1 = dict()
    dict1['error_code'] = 0
    for index, row in df.iterrows():
        dict1['userid'] = row['userid']
        tags = [int(t) for t in row['tags'].split(',')]
        dict1['tags'] = tags
        json_str = json.dumps(dict1)
        pipe.set('taonewslabel_' + row['userid'], json_str)
        # Flush the Redis pipeline in batches of 1000
        if index % 1000 == 0:
            pipe.execute()
    pipe.execute()
    print("taonewslabel finished.")

def run_sql(sql):
    conn = connect(host='172.17.69.25', auth_mechanism='PLAIN', port=21050,
                   user='******', password='******')
    cursor = conn.cursor()
    cnt = 1
    # Run the SQL against Impala; a multi-statement script is split on semicolons
    if ';' in sql:
        sql_list = sql.rstrip().split(';')
        if len(sql_list[-1]):
            for s in sql_list:
                print("running sql @ %s" % cnt)
                cursor.execute(s)
                cnt += 1
        else:
            # A trailing semicolon leaves an empty last element; drop it
            sql_list.pop()
            for s in sql_list:
                print("running sql @ %s" % cnt)
                cursor.execute(s)
                cnt += 1
    else:
        print("running sql @ %s" % cnt)
        cursor.execute(sql)
    # Convert the result set to a pandas DataFrame for analysis
    return as_pandas(cursor) if cursor.description is not None else 'null'

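# A usage sketch for the multi-statement runner above; the table name is a
# placeholder. Because every statement runs on the same cursor, only the
# final statement's result set is left for as_pandas to fetch, so a script
# should end with the SELECT whose rows you want back.
script = """
create table if not exists tmp.demo (id int);
insert into tmp.demo values (1);
select * from tmp.demo
"""
df = run_sql(script)
print(df)
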
def aistudy_knowledge_question(param=None):
    if param is None:
        param = {'year': '2018', 'term_id': '2', 'subject_id': '',
                 'grade_id': '7', 'city_id': '020'}
    sql = """
        select
            -- ad.cuc_num,
            -- cla.knowledge_id,
            -- cla.knowledge_name,
            ad.problem_id as item_id,
            ad.difficulty as difficulty,
            ad.discrimination,
            ad.new_difficulty
            -- ad.isdel,
            -- cla.id,
            -- cla.create_time,
            -- rank() over(partition by cla.knowledge_id, cla.knowledge_name order by cla.id) as order_num
        from odata.ods_ai_study_ad_paper_question ad
        -- join odata.ods_ai_study_ad_cla_knowledge cla on ad.knowledge_id = cla.knowledge_id
    """
    # Cache the result as a daily pickle so repeated calls skip the query
    today = datetime.today().strftime("%Y-%m-%d")
    file_name = "item_profile_%s.pkl" % today
    if os.path.exists(file_name):
        return pd.read_pickle(file_name)
    impala_cursor.execute(sql)
    df = as_pandas(impala_cursor)
    df.to_pickle(file_name)
    return df

def MaxMonth(self, Cursor, Month, Units, Limit=1):
    '''Get the maximum of the month, per unit.'''
    File = open('MaxMonth.csv', 'w')
    File.write("unidad, fecha, hora, costo_maximo\n")
    for item in Units:
        Qry = """
            select * from (
                select unidad, fecha_inicial, hora, precio_per_mw_10,
                       max(precio_per_mw_10) over (
                           partition by unidad
                           order by fecha_inicial, hora asc
                           rows between unbounded preceding and unbounded following
                       ) as costo_maximo
                from ofertas_energia_david
                where (tipo_reporte='TE' and unidad='{1}'
                       and month(fecha_inicial)={0} and precio_per_mw_10 > 0)
            ) data
            where data.costo_maximo = precio_per_mw_10 and data.costo_maximo > 0
            limit {2}
        """.format(Month, item, Limit)
        Cursor.execute(Qry)
        # Format the result set as a pandas DataFrame
        df = as_pandas(Cursor)
        print(df)
        try:
            File.write("{0}, {1}, {2}, {3}\n".format(
                str(df['data.unidad'][0]), str(df['data.fecha_inicial'][0]),
                str(df['data.hora'][0]), str(df['data.costo_maximo'][0])))
        except IndexError:
            # Empty result for this unit
            print(item + " vacio")
    File.close()

def impala_query(sql):
    conn = connect(**impala_config)
    cur = conn.cursor()
    cur.execute(sql)
    df = as_pandas(cur)
    conn.close()
    return df

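# A sketch of the impala_config mapping that impala_query above expands into
# impala.dbapi.connect. The host and database values are placeholders; the
# keys shown are standard connect() parameters.
impala_config = {
    'host': 'impala.example.com',
    'port': 21050,
    'database': 'default',
    'auth_mechanism': 'NOSASL',
}

df = impala_query('select 1 as one')
print(df)
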
def test(self):
    """Test"""
    cursor = self.conn.cursor()
    cursor.execute('show tables like "dm*_sdr_*dm"')
    df = as_pandas(cursor)
    data = df.to_dict('list')
    print(data)

def impala_connect(sql, **kwargs):
    # Impala defaults (the commented values below are the Hive equivalents)
    host = kwargs.get("host", 'impala.bjds.belle.lan')
    port = kwargs.get("port", 21051)
    timeout = kwargs.get("timeout", 3600)
    # hive:
    # host = kwargs.get("host", 'impala.bjds.belle.lan')
    # port = kwargs.get("port", 10008)
    # timeout = kwargs.get("timeout", 3600)
    user = kwargs.get("user", "lv.d.sz")
    password = kwargs.get("password", 'JHjLXpyQ')
    kerberos_service_name = kwargs.get("kerberos_service_name", "impala")
    conn = connect(host=host, port=port, timeout=timeout, user=user,
                   password=password, kerberos_service_name=kerberos_service_name,
                   auth_mechanism='LDAP')
    cur = conn.cursor(user=user)
    if sql is not None:
        cur.execute(sql)
    try:
        df = as_pandas(cur)
    except Exception:
        # Statements with no result set (DDL/DML) leave nothing to fetch;
        # hand back the cursor instead
        return cur
    return df

def getData(sql):
    '''Fetch the feature data for the input date via the given SQL.'''
    con = connect(**INCEPTOR_CONFIG)
    cur = con.cursor()
    cur.execute(sql)
    df_data = as_pandas(cur)
    cur.close()
    return df_data

def impala_db(hive_sql):
    conn = connect(host=get_config("impala", "host"),
                   port=get_config("impala", "port"),
                   database=get_config("impala", "database"),
                   auth_mechanism='PLAIN')
    curl = conn.cursor()
    curl.execute(hive_sql)
    return as_pandas(curl)

def read_df(self, statement):
    self.create_connect()
    self.cursor.execute(statement)
    df = as_pandas(self.cursor)
    self.close_connect()
    return df

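# One caveat with read_df as written: if execute() or as_pandas() raises,
# close_connect() never runs and the connection leaks. A minimal hardened
# sketch, assuming the same helper methods exist on the class:
def read_df(self, statement):
    self.create_connect()
    try:
        self.cursor.execute(statement)
        return as_pandas(self.cursor)
    finally:
        # Close the connection even when the query fails
        self.close_connect()
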
def report():
    cursor = get_hive_cursor()
    if cursor is None:
        return render_template('/main/bi_connection_issue.html')
    # FIXME we probably want to create aggregates on hadoop and cache them
    # rather than returning the whole data set here.
    # We need to ignore monitoring pings, which have rating user_id = -1
    # and movie_id = -1.
    try:
        cursor.execute(
            "select * from movie_ratings where customer_id <> '-1' and movie_id <> '-1'",
            configuration={
                'hive.mapred.supports.subdirectories': 'true',
                'mapred.input.dir.recursive': 'true'
            })
    except:
        return render_template('/main/bi_connection_issue.html')
    df = as_pandas(cursor)
    count = df.shape[0]
    if count == 0:
        return render_template('/main/bi_no_records.html')
    from bokeh.charts import Bar, output_file, show
    fig = Bar(
        df,
        label='movie_ratings.rating',
        values='movie_ratings.rating',
        agg='count',
        title='Distribution of movie ratings',
        legend=False
    )
    fig.plot_height = 400
    fig.xaxis.axis_label = 'Rating'
    fig.yaxis.axis_label = 'Count ( Rating )'
    js_resources = INLINE.render_js()
    css_resources = INLINE.render_css()
    script, div = components(fig)
    html = flask.render_template(
        '/main/embed.html',
        plot_script=script,
        plot_div=div,
        js_resources=js_resources,
        css_resources=css_resources,
    )
    return encode_utf8(html)

def select_sql_df(self, sql):
    """
    Run a query and return the result as a DataFrame.
    :param sql: query to execute
    :return: pandas DataFrame
    """
    self.cur.execute(sql)
    df = as_pandas(self.cur)
    return df

def dhive():
    # DDL for reference:
    # create external table aoi_test(panelId string, sn string, lineCode string,
    #     stationCode string, deviceId string, result string,
    #     AOI_errorCode map<string,string>, AOI_info map<string,string>,
    #     AOI_tbTag string, AOI_singleTag string, AOI_workOrder string,
    #     AOI_data string, AOI_time string, AOI_time_cnt_num int)
    # ROW FORMAT SERDE 'org.openx.data.jsonserde.JsonSerDe'
    # STORED AS TEXTFILE location '/pcb/AOI/';
    from impala.dbapi import connect
    from impala.util import as_pandas
    conn = connect(host='10.141.212.26', port=10000, database='pcb',
                   auth_mechanism='PLAIN')
    cur = conn.cursor()
    cur.execute('select * from aoi WHERE AOI_time_cnt_num = 1')
    df = as_pandas(cur)
    print(df)

def read_to_dataframe(self, sql):
    from impala.util import as_pandas
    self.cursor = self.conn.cursor()
    self.cursor.execute(sql)
    self.df = as_pandas(self.cursor)
    return self.df

def select_dataframe(sql):
    """
    Return a pandas DataFrame with the results of a SQL query to Impala.
    :param sql: query to execute
    :return: pandas DataFrame
    """
    cur = impala_conn.cursor()
    cur.execute(sql)
    return as_pandas(cur)

def _get_normalization_data(sql):
    cur = impala_cli.cursor(sql, True)
    df = as_pandas(cur)
    ret = {}
    ret['time'] = df['c_date'].tolist()
    ret['datas'] = __parse_data(df)
    logger.info('_get_normalization_data data:%s', ret)
    return ret

def test_zBizRule6(self):
    cs = self.icon.cursor()
    try:
        cs.execute(
            """ select distinct order_date from db.fct_total_order_trans
                where data_source = 'bs' and order_date >= '2018-09-01'""")
        df = as_pandas(cs)
        rs = df["order_date"].tolist()
        breakdates = TableTestCase.TableTestCase.ConsecutivenessDateTest(rs, "%Y-%m-%d")
        if len(breakdates) > 0:
            # "Official-site data in the omnichannel table has a gap between %s and %s"
            self.raiseFailure("全渠道交易表官网数据存在中断,在%s和%s之间 " % (breakdates[0], breakdates[1]))
        cs.execute(
            """select distinct order_date from db.fct_total_order_trans
               where data_source = 'tmall'""")
        df = as_pandas(cs)
        rs = df["order_date"].tolist()
        breakdates = TableTestCase.TableTestCase.ConsecutivenessDateTest(rs, "%Y-%m-%d")
        if len(breakdates) > 0:
            # "Tmall data in the omnichannel table has a gap between %s and %s"
            self.raiseFailure("全渠道交易表Tmall数据存在中断,在%s和%s之间 " % (breakdates[0], breakdates[1]))
        cs.execute(
            """select distinct order_date from db.fct_total_order_trans
               where data_source like '%o2o%' and order_date >= '2016-09-01'""")
        df = as_pandas(cs)
        rs = df["order_date"].tolist()
        breakdates = TableTestCase.TableTestCase.ConsecutivenessDateTest(rs, "%Y-%m-%d")
        if len(breakdates) == 0:
            pass
        # Tmall switch on 2018/5/16
        elif len(breakdates) in (2, 3) and str(breakdates[0]).strip() == '2018-05-15 00:00:00' \
                and str(breakdates[1]).strip() == '2018-05-17 00:00:00':
            pass
        else:
            # "O2O data in the omnichannel table has gaps at %s"
            self.raiseFailure("全渠道交易表O2O数据存在中断,在 %s " % ','.join([str(s) for s in breakdates]))
    finally:
        cs.close()

def hive2dataframe(sql_context):
    conn = connect(host='192.168.1.73', port=10000, auth_mechanism='PLAIN',
                   user='******', password='******', database='ods')
    cursor = conn.cursor()
    cursor.execute(sql_context)
    return as_pandas(cursor)

def monitor():
    auth = request.authorization
    if not auth or not check_auth(auth.username, auth.password):
        data = {"error": "Permission denied."}
        return app.response_class(
            response=json.dumps(data),
            status=550,
            mimetype='application/json'
        )
    cursor = get_hive_cursor()
    if cursor is None:
        data = {"error": "Could not connect to Hive"}
        return app.response_class(
            response=json.dumps(data),
            status=500,
            mimetype='application/json'
        )
    # Send a synthetic rating tagged with the current timestamp, then wait
    # for it to land in Hadoop before querying for it
    timestamp = time.time()
    message = '{0},{1},{2}'.format(-1, -1, timestamp)
    messagehub_client.send_message(message)
    time.sleep(70)
    cursor.execute(
        'select * from movie_ratings where rating = {0}'.format(timestamp),
        configuration={
            'hive.mapred.supports.subdirectories': 'true',
            'mapred.input.dir.recursive': 'true'
        })
    df = as_pandas(cursor)
    count = df.shape[0]
    if count == 1:
        data = {"ok": "App rating found in hadoop."}
        status = 200
    else:
        data = {"error": "App rating not found in hadoop."}
        status = 500
    return app.response_class(
        response=json.dumps(data),
        status=status,
        mimetype='application/json'
    )

def impala_run_and_upload(query_string, metric_name):
    """
    Connects to the Avvo production cluster and retrieves a dataframe.

    :param str query_string: an Impala query to be run
    :rtype: pd.DataFrame
    :return: dataframe with results
    """
    if not isinstance(query_string, str):
        raise TypeError('query_string must be a string!')
    conn = connect(host='dn1wow.prod.avvo.com', port=21050, database='tmp_data_dm',
                   auth_mechanism="GSSAPI", kerberos_service_name='impala')
    cur = conn.cursor()
    cur.execute(query_string)
    output_df = as_pandas(cur)
    drop_table = 'drop table if exists rd_%s_temp' % metric_name
    cur.execute(drop_table)
    # Find the date column
    headers = output_df.head()
    matching = [s for s in headers if 'date' in s]
    date_column = matching[0]
    print('creating %s temp table \n' % metric_name)
    write_query_string = "create table rd_%s_temp as %s;" % (metric_name, query_string)
    cur.execute(write_query_string)
    time.sleep(2)
    # Create the main table if it doesn't exist
    print('creating %s table \n' % metric_name)
    create_table = 'CREATE TABLE if not exists rd_%s as ( select * from rd_%s_temp order by 1)' \
        % (metric_name, metric_name)
    cur.execute(create_table)
    time.sleep(2)
    print('updating %s table \n' % metric_name)
    query_string = "with new_data as (select * from rd_%s where \
        %s not in (select %s from rd_%s_temp)) \
        insert into rd_%s_temp select * from new_data order by %s" % (
        metric_name, date_column, date_column, metric_name, metric_name, date_column)
    cur.execute(query_string)
    time.sleep(2)
    replace_query = "with new_data as (select * from rd_%s_temp) \
        insert overwrite rd_%s select * from new_data order by %s" % (
        metric_name, metric_name, date_column)
    cur.execute(replace_query)
    conn.close()
    return output_df

def __execute(self, sql, ret='pandas'):
    ret_val = None
    with connect(**self.connection) as con:
        cur = con.cursor()
        cur.execute(sql)
        if ret == 'pandas':
            ret_val = as_pandas(cur)
        elif ret == 'status':
            ret_val = cur.status()
    return ret_val

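# Since __execute above is name-mangled, it is presumably reached through
# public wrappers on its class. A self-contained sketch of what such an
# enclosing class might look like; the class and wrapper names here are
# assumptions, not part of the original.
from impala.dbapi import connect
from impala.util import as_pandas

class ImpalaClient(object):
    def __init__(self, **connection):
        # Keyword arguments for impala.dbapi.connect, e.g. host and port
        self.connection = connection

    def __execute(self, sql, ret='pandas'):
        ret_val = None
        with connect(**self.connection) as con:
            cur = con.cursor()
            cur.execute(sql)
            if ret == 'pandas':
                ret_val = as_pandas(cur)
            elif ret == 'status':
                ret_val = cur.status()
        return ret_val

    def query(self, sql):
        # Public wrapper returning a DataFrame
        return self.__execute(sql, ret='pandas')

    def run(self, sql):
        # Public wrapper returning only the operation status
        return self.__execute(sql, ret='status')
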
def main():
    HOST = '127.0.0.1'
    PORT = 21050
    SQL = 'select * from xxx limit 1'
    try:
        impala_cli = Impala(HOST, PORT)
        cur = impala_cli.cursor(SQL, True)
        df = as_pandas(cur)
        print(df.test.describe())
    except Exception as e:
        print(e)

def test_zBizRule9(self):
    cs = self.icon.cursor()
    try:
        cs.execute(
            """ select count(*) ct from db.fct_total_order_trans
                where customer_key is null """)
        df = as_pandas(cs)
        ct = int(df["ct"][0])
        if ct > 0:
            # "Omnichannel transaction table: null customer_key values found"
            self.raiseFailure("全渠道交易表:存在空的customer_key")
    finally:
        cs.close()

def run_sql(sql, user, pswd):
    conn = connect(host='10.2.8.91', auth_mechanism='PLAIN', port=21050,
                   user=user, password=pswd)
    cursor = conn.cursor()
    cursor.execute(sql)
    # Convert the result set to a DataFrame; statements that return no rows
    # leave cursor.description as None, so hand back 'null' for those
    return as_pandas(cursor) if cursor.description is not None else 'null'

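# A quick illustration of the cursor.description guard used above: a SELECT
# populates description with column metadata, while a statement that yields
# no result set leaves it None, so the wrapper returns the string 'null'.
# Hypothetical calls, with placeholder credentials and table names:
df = run_sql('select 1 as one', 'user', 'pswd')                 # pandas DataFrame
status = run_sql('refresh some_db.some_table', 'user', 'pswd')  # 'null'
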
def _ingest_summary(self):
    # Get date parameters
    yr = self._date[:4]
    mn = self._date[4:6]
    dy = self._date[6:]
    self._logger.info("Getting ingest summary data for the day")
    ingest_summary_cols = ["date", "total"]
    result_rows = []
    df_filtered = pd.DataFrame()
    # Get the ingest summary
    query_to_load = ("""
        SELECT tryear, trmonth, trday, trhour, trminute, COUNT(*) as total
        FROM {0}.{1}
        WHERE y={2} AND m={3} AND d={4}
          AND unix_tstamp IS NOT NULL AND sip IS NOT NULL
          AND sport IS NOT NULL AND dip IS NOT NULL
          AND dport IS NOT NULL AND ibyt IS NOT NULL
          AND ipkt IS NOT NULL AND tryear={2}
          AND cast(treceived as timestamp) IS NOT NULL
        GROUP BY tryear, trmonth, trday, trhour, trminute;
    """).format(self._db, self._table_name, yr, mn, dy)
    results = impala.execute_query(query_to_load)
    if results:
        df_results = as_pandas(results)
        # Form a new dataframe joining the date parts into a single time column
        df_new = pd.DataFrame(
            [["{0}-{1}-{2} {3}:{4}".format(val['tryear'], val['trmonth'], val['trday'],
                                           val['trhour'], val['trminute']),
              int(val['total']) if not math.isnan(val['total']) else 0]
             for key, val in df_results.iterrows()],
            columns=ingest_summary_cols)
        value_string = ''
        # Group the data by minute
        sf = df_new.groupby(by=['date'])['total'].sum()
        df_per_min = pd.DataFrame({'date': sf.index, 'total': sf.values})
        df_final = df_filtered.append(df_per_min, ignore_index=True).to_records(False, False)
        if len(df_final) > 0:
            query_to_insert = ("""
                INSERT INTO {0}.flow_ingest_summary PARTITION (y={1}, m={2}, d={3})
                VALUES {4};
            """).format(self._db, yr, mn, dy, tuple(df_final))
            impala.execute_query(query_to_insert)
    else:
        self._logger.info("No data found for the ingest summary")

def take(self, n):
    """Return `n` rows as a pandas `DataFrame`

    Distributed and no notion of order, so not guaranteed to be
    reproducible.
    """
    alias = _random_id('inline_', 4)
    table_ref = InlineView(self._query_ast.to_sql(), alias)
    select_list = [SelectItem(table_name=TableName(table_ref.name))]  # SELECT alias.*
    limit_elt = LimitElement(Literal(n), None)
    ast = SelectStmt(select_list, table_ref, limit=limit_elt)
    bdf = BigDataFrame(self._ic, ast)
    return as_pandas(bdf.__iter__())

def desc_total_sales_volumn(year):
    # Redis: return the cached value if present
    REDIS_KEY = "desc_total_sales_vol:{0}".format(year)
    cached_data = redis_io.read_transaction(REDIS_KEY)
    if cached_data is not None:
        return ast.literal_eval(cached_data)

    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()
    # Daily transaction aggregation
    cur.execute('USE salest')
    bIsRealTimeUpdated = REFRESH_IMPALA_TABLE(cur, year)
    cur.execute("""
        SELECT year_month_day,
               SUM(num_of_product) AS num_of_product,
               SUM(sales_amount) AS total_amount
        FROM (
            SELECT SUBSTR(date_receipt_num,1,10) AS year_month_day,
                   num_of_product, sales_amount
            FROM """ + GET_IMPALA_TB_NAME(year) + """
            WHERE SUBSTR(date_receipt_num,1,4) = '""" + year + """'
        ) view_tr_recipt
        GROUP BY year_month_day
        ORDER BY year_month_day ASC
    """)
    df_tr_agg_daily = as_pandas(cur)
    conn.close()
    series_sum = df_tr_agg_daily[['num_of_product', 'total_amount']].sum()
    series_sum.name = 'sum'
    df_desc = df_tr_agg_daily.describe().append(series_sum)
    df_desc['num_of_product'] = df_desc['num_of_product'].apply(lambda v: round(v))
    df_desc['total_amount'] = df_desc['total_amount'].apply(lambda v: round(v))
    df_desc.fillna(0, inplace=True)
    cached_data = df_desc.to_dict()
    if bIsRealTimeUpdated == False:
        # Redis: save the cached value
        redis_io.write_transaction(REDIS_KEY, cached_data)
    # Read back the stored string form and parse it into a dict
    cached_data = redis_io.read_transaction(REDIS_KEY)
    cached_data = ast.literal_eval(cached_data)
    return cached_data

def get_timebase_data_on_past_specific_date(cur_date):
    # Redis: return the cached value if present
    REDIS_KEY_PREFIX = "past_timebase_data_of"

    def get_cache_value(cur_date):
        return redis_io.read_dict_transaction(REDIS_KEY_PREFIX + ":" + cur_date)

    cached_data = get_cache_value(cur_date)
    if cached_data is not None:
        return cached_data

    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()
    cur.execute('USE salest')
    date_list = tuple(get_past_target_date(cur_date))
    cur.execute("""
        SELECT time_hour,
               CAST(SUM(sales_amount) as INTEGER) AS total_amount,
               COUNT(sales_amount) as num_of_transaction,
               COUNT(DISTINCT year_month_day) as date_count
        FROM (
            SELECT SUBSTR(date_receipt_num,1,10) AS year_month_day,
                   SUBSTR(tr_time,1,2) AS time_hour,
                   sales_amount
            FROM ext_tr_receipt
            WHERE SUBSTR(date_receipt_num,1,10) IN ('%s')
        """ % date_list + """
        ) view_tr_total_amount_by_dayofweek
        GROUP BY time_hour
        ORDER BY time_hour ASC
    """)
    df_by_hour = as_pandas(cur)
    conn.close()
    df_by_hour.set_index('time_hour', inplace=True)
    df_by_hour = df_by_hour.reindex([str(i) for i in np.arange(10, 24)], fill_value=0)
    dict_result = df_by_hour['total_amount'].to_dict()
    dict_result['date'] = date_list[0]
    redis_io.write_dict_transaction(REDIS_KEY_PREFIX + ":" + cur_date, dict_result, 60*60)
    ret_dict = get_cache_value(cur_date)
    return ret_dict

def AvgMonth(self, Cursor, Month, Units, Limit=1):
    '''Get the average of the month, per unit.'''
    File = open('AvgMonth.csv', 'w')
    File.write("Unidad, promedio\n")
    for item in Units:
        Qry = """
            select avg(precio_per_mw_10)
            from ofertas_energia_david
            where tipo_reporte='TE' and unidad='{1}'
              and month(fecha_inicial)={0} and precio_per_mw_10 > 0
            limit {2}
        """.format(Month, item, Limit)
        Cursor.execute(Qry)
        # Format the result set as a pandas DataFrame
        df = as_pandas(Cursor)
        File.write("{0}, {1}\n".format(item, str(df['c0'][0])))
    File.close()

def get_product_data(product_name):
    # Redis: return the cached value if present
    REDIS_KEY_PREFIX = "popular_product_info"

    def get_cache_value(product_name):
        cache_data = redis_io.read_dict_transaction(
            REDIS_KEY_PREFIX + ":" + product_name, ['product_code', 'price'])
        if cache_data is None:
            return None
        dict_data = {}
        dict_data['product_code'] = cache_data[0]
        dict_data['price'] = cache_data[1]
        return dict_data

    cached_data = get_cache_value(product_name)
    if cached_data is not None:
        return cached_data

    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()
    cur.execute('USE salest')
    queryStr = """SELECT product_name, price, product_code FROM ext_menumap_info"""
    cur.execute(queryStr)
    df_categories = as_pandas(cur)
    df_categories = df_categories[df_categories.price != 0]
    # Cache every product row for 30 days
    for idx, row in df_categories.iterrows():
        key = "{0}:{1}".format(REDIS_KEY_PREFIX, row.product_name)
        value = row[['product_code', 'price']].to_dict()
        redis_io.write_dict_transaction(key, value, 60*60*24*30)
    return get_cache_value(product_name)

def collect(self):
    """Return the BDF data to the client as a pandas DataFrame"""
    return as_pandas(self.__iter__())

def analysis_timebase_sales_amount(year, day_of_week):
    # Redis: return the cached value if present
    REDIS_KEY = "timebase_sales_amount:{0}:{1}".format(year, day_of_week)
    cached_data = redis_io.read_transaction(REDIS_KEY)
    if cached_data is not None:
        return ast.literal_eval(cached_data)

    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()
    cur.execute('USE salest')
    bIsRealTimeUpdated = REFRESH_IMPALA_TABLE(cur, year)
    start_date = "%s/01/01" % year
    end_date = "%s/12/31" % year
    if day_of_week == 'All':
        target_date_idx = pd.date_range(start_date, end_date)
    else:
        target_date_idx = pd.date_range(start_date, end_date, freq=day_of_week)
    target_date_arr = target_date_idx.strftime('%Y-%m-%d')
    target_date_tuple = tuple(target_date_arr)
    cur.execute("""
        SELECT time_hour,
               CAST(SUM(sales_amount) as INTEGER) AS total_amount,
               COUNT(sales_amount) as num_of_transaction,
               COUNT(DISTINCT year_month_day) as date_count
        FROM (
            SELECT SUBSTR(date_receipt_num,1,10) AS year_month_day,
                   SUBSTR(tr_time,1,2) AS time_hour,
                   sales_amount
            FROM """ + GET_IMPALA_TB_NAME(year) + """
            WHERE SUBSTR(date_receipt_num,1,10) IN %s
        """ % (target_date_tuple,) + """
        ) view_tr_total_amount_by_dayofweek
        GROUP BY time_hour
        ORDER BY time_hour ASC
    """)
    df_by_weekofday = as_pandas(cur)
    conn.close()

    def calc_average_amount(row):
        return row.total_amount / row.date_count

    df_by_weekofday['total_amount'] = df_by_weekofday.apply(calc_average_amount, axis=1)
    df_by_weekofday.set_index('time_hour', inplace=True)
    cached_data = df_by_weekofday.to_dict()
    if bIsRealTimeUpdated == False:
        # Redis: save the cached value
        redis_io.write_transaction(REDIS_KEY, cached_data)
    return cached_data

def agg_montly_total_amount_by_product_cate(year):
    # Redis: return the cached value if present
    REDIS_KEY = "monthly_total_amount_per_cate:{0}".format(year)
    cached_data = redis_io.read_transaction(REDIS_KEY)
    if cached_data is not None:
        return ast.literal_eval(cached_data)

    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()
    cur.execute('USE salest')
    bIsRealTimeUpdated = REFRESH_IMPALA_TABLE(cur, year)
    cur.execute("""
        SELECT SUBSTR(view_tr_receipt.date_receipt_num,1,7) AS year_month,
               view_tr_receipt.num_of_product,
               view_tr_receipt.sales_amount AS total_amount,
               ext_menumap_info.product_name,
               ext_menumap_info.cate_name,
               ext_menumap_info.price
        FROM (SELECT * FROM """ + GET_IMPALA_TB_NAME(year) + """
              WHERE SUBSTR(date_receipt_num,1,4) = '""" + year + "'" + """) view_tr_receipt
        JOIN ext_menumap_info USING (product_code)""")
    df_tr_receipt_menumap = as_pandas(cur)
    conn.close()

    def aggregation(row):
        total_amount = row['total_amount'].sum()
        return pd.Series([total_amount], index=['total_amount'])

    df_monthly_product_tr = df_tr_receipt_menumap.groupby(['year_month', 'cate_name']).apply(aggregation)
    df_default = genDefaultMontlyCateTotalAmountDataFrame(df_monthly_product_tr, year, 'cate_name')
    df_all_monatly_sales_volume = pd.merge(
        df_default, df_monthly_product_tr,
        left_index=True, right_index=True, how='outer').fillna(0).sort_index(ascending='1')

    def post_aggregation(row):
        return row['total_amount_x'] + row['total_amount_y']

    df_all_monatly_sales_volume['total_amount'] = df_all_monatly_sales_volume.apply(post_aggregation, axis=1)
    df_all_monatly_sales_volume.drop(['total_amount_x', 'total_amount_y'], axis=1, inplace=True)

    def gen_dict_total_amount(month_rows):
        monthlyDict = {}
        monthlyDictKey = month_rows.index.get_level_values('year_month')[0]
        monthCateItemsStr = "{"
        for item in zip(month_rows.index.get_level_values('cate_name'), month_rows['total_amount']):
            monthCateItemsStr += "'{0}':{1},".format(item[0], item[1])
        monthCateItemsStr = monthCateItemsStr[:-1]
        monthCateItemsStr += "}"
        monthlyDict = ast.literal_eval(monthCateItemsStr)
        monthlyDict['year_month'] = month_rows.index.get_level_values('year_month')[0]
        return monthlyDict

    mothlyTotalAmountDictItems = df_all_monatly_sales_volume.groupby(
        df_all_monatly_sales_volume.index.get_level_values('year_month')).apply(gen_dict_total_amount)
    mothlyTotalAmountDict = {}
    mothlyTotalAmountList = []
    for item in mothlyTotalAmountDictItems:
        mothlyTotalAmountList.append(item)
    mothlyTotalAmountDict['total_amount'] = mothlyTotalAmountList
    if bIsRealTimeUpdated == False:
        # Redis: save the cached value
        redis_io.write_transaction(REDIS_KEY, mothlyTotalAmountDict)
    return mothlyTotalAmountDict

ptb_mothers = open('PTB1v4_mothers.txt', 'r').readlines()
ptb_list = map(str.strip, ptb_mothers)

##################
## create queries
##################
from impala.util import as_pandas

ptb_query = ("SELECT * "
             "FROM p7_ptb.illumina_variant as ill, public_hg19.ensembl_genes as ens "
             "WHERE ens.gene_name IN ('" + "','".join(gene_list) + "') "
             "AND ill.sample_id IN ('" + "','".join(ptb_list) + "') "
             "AND ens.chromosome = ill.chr AND (ill.pos >= ens.start AND ill.pos <= ens.stop)")
cur.execute(ptb_query)
ptb_results = as_pandas(cur)
ptb_results.to_csv("./ptb_variants_Ill.tsv", sep="\t")

ptb_query = ("SELECT * "
             "FROM p7_ptb.comgen_variant as com, public_hg19.ensembl_genes as ens "
             "WHERE ens.gene_name IN ('" + "','".join(gene_list) + "') "
             "AND com.sample_id IN ('" + "','".join(ptb_list) + "') "
             "AND ens.chromosome = com.start AND (com.start >= ens.start AND com.start <= ens.stop)")
cur.execute(ptb_query)
ptb_results = as_pandas(cur)
ptb_results.to_csv("./ptb_variants_com.tsv", sep="\t")

ftb_query = ("SELECT * "

def queryEvents(eventLevels, whereClauses, fromRootThresholdPct, fromParentThresholdPct,
                appName, startEvent, startDate, endDate):
    print("This will take a few minutes")
    vID = 0
    levelID = 1
    # Build the WHERE clause; 'true' keeps the query valid when no extra
    # clauses are supplied
    where = 'true '
    for eachClause in whereClauses:
        where = where + "and " + eachClause
    # Query Impala for the root event
    query = """select distinct 0 as vID, 0 as levelID, 0 as parentVID, event_name, distinct_id
               from fact_beacon_history
               where %s and event_name = '%s' and app_name = '%s'
               and date_sid between '%s' and '%s' """ % (where, startEvent, appName, startDate, endDate)
    conn = connect(host='52.89.99.148', port=21050)
    cur = conn.cursor()
    t0 = datetime.datetime.utcnow()
    cur.execute(query)
    t1 = datetime.datetime.utcnow()
    treeData = as_pandas(cur)
    rootUserCount = len(treeData['distinct_id'])
    vID = 1
    while levelID < eventLevels:
        eventList = treeData[treeData['levelid'] == levelID - 1].drop_duplicates(
            subset=['event_name', 'vid'], take_last=True)[['event_name', 'vid']].values
        for eachEvent, pvid in eventList:
            # Query Impala for the events that follow eachEvent
            query = """select distinct event_name from fact_beacon_history
                       where %s and app_name = '%s' and prev_event = '%s'
                       and date_sid between '%s' and '%s'""" % (where, appName, eachEvent, startDate, endDate)
            conn = connect(host='52.89.99.148', port=21050)
            cur = conn.cursor()
            t0 = datetime.datetime.utcnow()
            cur.execute(query)
            t1 = datetime.datetime.utcnow()
            events = as_pandas(cur)
            parentVID = pvid
            for eachSubEvent in events[events['event_name'] != startEvent]['event_name']:
                parentUserCount = len(treeData[(treeData['levelid'] == levelID - 1)
                                               & (treeData['event_name'] == eachEvent)])
                parentRootConv = np.true_divide(parentUserCount, rootUserCount) * 100
                if parentRootConv > fromRootThresholdPct:
                    query = """select distinct %s as vID, %s as levelID, %s as parentVID, event_name, distinct_id
                               from fact_beacon_history
                               where %s and app_name = '%s' and event_name = '%s' and prev_event = '%s'
                               and date_sid between '%s' and '%s' """ % (
                        vID, levelID, parentVID, where, appName, eachSubEvent, eachEvent, startDate, endDate)
                    conn = connect(host='52.89.99.148', port=21050)
                    cur = conn.cursor()
                    t0 = datetime.datetime.utcnow()
                    cur.execute(query)
                    t1 = datetime.datetime.utcnow()
                    addTreeData = as_pandas(cur)
                    parentDistinctIDs = treeData[treeData['vid'] == parentVID].values
                    addRecord = False
                    # Keep only sub-event rows whose users also hit the parent node
                    for eachID in addTreeData['distinct_id']:
                        if eachID in parentDistinctIDs:
                            treeData = pd.concat([treeData, addTreeData[addTreeData['distinct_id'] == eachID]])
                            addRecord = True
                    tempGraphData = pd.DataFrame(
                        {'user_count': treeData.groupby(['vid', 'levelid', 'parentvid', 'event_name'],
                                                        as_index=False).size()}).reset_index()
                    nodeUserCount = tempGraphData[tempGraphData['vid'] == max(tempGraphData['vid'])]['user_count']
                    fromParentConv = np.true_divide(nodeUserCount, parentUserCount).values * 100
                    if addRecord == True and fromParentConv > fromParentThresholdPct:
                        vID += 1
                    elif addRecord == True:
                        # Below the parent-conversion threshold: discard the node
                        treeData = treeData[treeData['vid'] != max(treeData['vid'])]
        levelID += 1
    return treeData

def head(self, row_count=5):
    head_query = ('SELECT * FROM (%s) AS head_tbl LIMIT %d'
                  % (self._query_ast.to_sql(), row_count))
    self._ic._cursor.execute(head_query)
    return as_pandas(self._ic._cursor)

def get_most_popular_products(req_cate_name):
    # Redis: return the cached value if present
    REDIS_KEY = "product_category_info"

    def get_cache_value(req_cate_name):
        subKeyList = []
        subKeyList.append(req_cate_name)
        return redis_io.read_dict_transaction(REDIS_KEY, subKeyList)

    cached_data = get_cache_value(req_cate_name)
    if cached_data is not None:
        return ast.literal_eval(cached_data[0])

    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()
    cur.execute('USE salest')
    dict_product_cate_items = {}
    # Category items
    queryStr = """ SELECT DISTINCT cate_name FROM ext_menumap_info """
    cur.execute(queryStr)
    df_categories = as_pandas(cur)
    dict_product_cate_items['All'] = df_categories['cate_name'].values.tolist()
    # Ten most popular items per category
    for cate_name in dict_product_cate_items['All']:
        query_str = """
            SELECT product_name, SUM(sales_amount) AS total_amount
            FROM (
                SELECT cate_name, product_name, date_receipt_num, sales_amount
                FROM (SELECT * FROM ext_menumap_info
                      WHERE cate_name = '""" + cate_name + """') view_specific_menu
                JOIN ext_tr_receipt USING (product_code)
            ) view_tr_specific_cate_menu
            GROUP BY (view_tr_specific_cate_menu.product_name)
            ORDER BY (SUM(sales_amount)) DESC
            LIMIT 10
        """
        cur.execute(query_str)
        df_papular_products = as_pandas(cur)
        df_papular_products = df_papular_products[df_papular_products.total_amount != 0]
        dict_product_cate_items[cate_name] = df_papular_products['product_name'].values.tolist()
    conn.close()
    # Redis: save the cached value for 30 days
    redis_io.write_dict_transaction(REDIS_KEY, dict_product_cate_items, 60*60*24*30)
    cached_data = get_cache_value(req_cate_name)
    return ast.literal_eval(cached_data[0])

def agg_montly_sales_volumn(year, unit_numofproduct, unit_totalamount):
    # Redis: return the cached value if present
    REDIS_KEY = "monthly_sales_vol:{0}".format(year)
    cached_data = redis_io.read_transaction(REDIS_KEY)
    if cached_data is not None:
        return ast.literal_eval(cached_data)

    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()
    cur.execute('USE salest')
    bIsRealTimeUpdated = REFRESH_IMPALA_TABLE(cur, year)
    cur.execute("""
        SELECT year_month,
               SUM(num_of_product) AS num_of_product,
               SUM(sales_amount) AS total_amount
        FROM (
            SELECT SUBSTR(date_receipt_num,1,7) AS year_month,
                   num_of_product, sales_amount
            FROM """ + GET_IMPALA_TB_NAME(year) + """
            WHERE SUBSTR(date_receipt_num,1,4) = '""" + year + """'
        ) view_tr_recipt
        GROUP BY year_month
        ORDER BY year_month ASC
    """)
    df = as_pandas(cur)
    conn.close()
    # Fill the months missing from the result with zero-valued rows
    month_index_arr = []
    for month in range(1, 13):
        month_index_arr.append("{0}-{1:02d}".format(year, month))
    df_base_index = pd.DataFrame(data=month_index_arr, columns=['year_month'])
    df_all_monatly_sales_volume = pd.merge(
        df, df_base_index, on='year_month',
        how='outer').fillna(0).sort_values(by='year_month', ascending='1')
    df_list = list(df_all_monatly_sales_volume.itertuples(index=False))
    df_column_name_list = list(df.columns.values)
    list_month_sales_volume = []
    for row in df_list:
        dict_month_sales_volume = {}
        for key, value in zip(df_column_name_list, row):
            if key == 'num_of_product':
                value = int(round(value / unit_numofproduct))
            if key == 'total_amount':
                value = int(round(value / unit_totalamount))
            dict_month_sales_volume[key] = value
        list_month_sales_volume.append(dict_month_sales_volume.copy())
    if bIsRealTimeUpdated == False:
        # Redis: save the cached value
        redis_io.write_transaction(REDIS_KEY, list_month_sales_volume)
    return list_month_sales_volume

def agg_montly_total_amount_by_product(year, product_cate):
    # Redis: return the cached value if present
    REDIS_KEY = "monthly_total_amount_per_product:{0}:{1}".format(year, product_cate.encode("UTF-8"))
    cached_data = redis_io.read_transaction(REDIS_KEY)
    if cached_data is not None:
        return ast.literal_eval(cached_data)

    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()
    cur.execute('USE salest')
    bIsRealTimeUpdated = REFRESH_IMPALA_TABLE(cur, year)
    query_str = """
        SELECT * FROM (
            SELECT SUBSTR(view_tr_receipt.date_receipt_num,1,7) AS year_month,
                   view_tr_receipt.num_of_product,
                   view_tr_receipt.sales_amount AS total_amount,
                   ext_menumap_info.product_name,
                   ext_menumap_info.cate_name,
                   ext_menumap_info.price
            FROM (SELECT * FROM """ + GET_IMPALA_TB_NAME(year) + """
                  WHERE SUBSTR(date_receipt_num,1,4) = '%s') view_tr_receipt
            JOIN ext_menumap_info USING (product_code)
        ) view_tr_receipt_menumap
        WHERE cate_name = '%s'
    """ % (year, product_cate)
    cur.execute(query_str.encode("UTF-8"))
    df_monthly_product_tr = as_pandas(cur)
    conn.close()
    column_func_tuple = [('total_amount', 'sum')]
    df_monthly_summary = df_monthly_product_tr.groupby(
        ['year_month', 'product_name'])['total_amount'].agg(column_func_tuple)
    df_monthly_summary.rename(columns={'total_amount': 'total_amount_B'}, inplace=True)
    df_default = genDefaultMontlyCateTotalAmountDataFrame(df_monthly_summary, year, 'product_name')
    df_default.rename(columns={'total_amount': 'total_amount_A'}, inplace=True)
    df_per_category = pd.concat([df_default, df_monthly_summary], axis=1).fillna(0)

    def post_aggregation(row):
        return row[0] + row[1]

    df_per_category['total_amount'] = df_per_category.apply(post_aggregation, axis=1)
    df_per_category.drop(['total_amount_A', 'total_amount_B'], axis=1, inplace=True)
    # Overall top-10 menu items in the category
    df_topten_products_by_total_amount = df_monthly_product_tr.groupby(
        ['product_name']).sum().sort_values(by='total_amount', ascending=False)[:10]
    df_topten_products_by_total_amount.drop(['num_of_product'], axis=1, inplace=True)
    df_topten_products_by_total_amount.rename(columns={'total_amount': 'overall_total_amount'}, inplace=True)
    # Redis: cache the top-10 list for a week
    redis_io.write_transaction(product_cate, df_topten_products_by_total_amount.index.tolist(), 60*60*24*7)
    # Merge the two dataframes above
    df_new = df_per_category.reset_index(level=0)
    df_merged = pd.merge(df_new, df_topten_products_by_total_amount, left_index=True,
                         right_index=True, how='left').sort_values(by='year_month', ascending=True)

    def agg_monthly_items_summary(row):
        sr_columns = row[row['overall_total_amount'].notnull()].index
        sr_values = row[row['overall_total_amount'].notnull()]['total_amount']
        etcSum = row[row['overall_total_amount'].isnull()]['total_amount'].sum()
        sr_columns = sr_columns.insert(sr_columns.size, 'ETC')
        sr_etc = pd.Series([etcSum], index=['ETC'])
        sr_values = sr_values.append(sr_etc)
        return pd.Series(sr_values, index=sr_columns)

    df_merged_new = df_merged.reset_index(level=0)
    df_agg_monthly_summary = df_merged.groupby(['year_month']).apply(agg_monthly_items_summary)
    df_agg_monthly_summary.fillna(0, inplace=True)
    monthlyDictItems = df_agg_monthly_summary.apply(gen_dict_total_amount, axis=1)
    mothlyTotalAmountDict = {}
    mothlyTotalAmountList = []
    for item in monthlyDictItems:
        mothlyTotalAmountList.append(item)
    mothlyTotalAmountDict['total_amount'] = mothlyTotalAmountList
    if bIsRealTimeUpdated == False:
        # Redis: save the cached value
        redis_io.write_transaction(REDIS_KEY, mothlyTotalAmountDict)
    return mothlyTotalAmountDict

from impala.dbapi import connect
from impala.util import as_pandas
import pandas as pd

conn = connect(host='ec2-54-86-98-154.compute-1.amazonaws.com', port=21050)
cur = conn.cursor()

# Describe Kaviar
cur.execute('use p7_ref_grch37')
cur.execute('describe kaviar')
print("Describing Kaviar fields...")
print('\n'.join(' - '.join(elems) for elems in cur.fetchall()))

# Getting Kaviar data
# This query retrieves chrom 8 p arm 0-23100000
print("Selecting data from kaviar for chromosome 8 pos 0-23100000")
cur.execute('select chrom, pos, allele_freq from kaviar where chrom = "8"')
# cur.execute('select chrom, pos, allele_freq from kaviar where chrom = "8" and pos >= 0 and pos <= 23100000')
kaviar = as_pandas(cur)  # this will take a relatively long time
# Prefix 'chr' to match the ideogram and ucsc_genes
kaviar["chrom"] = 'chr' + kaviar["chrom"].astype(str)
kaviar.to_csv("kaviar_results.txt")
print("Finished saving kaviar data")
kaviar["start"] = kaviar["pos"]
kaviar["end"] = kaviar["pos"]
kaviar["width"] = kaviar.end - kaviar.start + 10
kaviar['colors'] = '#43a822'  # green

# Bin kaviar data
binned_kaviar = pd.DataFrame(columns=('chrom', 'start', 'end', 'avg', 'max_af', 'min_af'))
for chrom, group in kaviar.groupby('chrom'):
    width = group.shape[0]
    binsize = 10000
    num_bins = width / binsize

from impala.dbapi import connect
from impala.util import as_pandas
import traceback

select = "SELECT * FROM tab LIMIT 100"
try:
    conn = connect(host='your_host', port=21050)
    cursor = conn.cursor()
    try:
        cursor.execute(select)
        print(cursor.description)  # prints the result set's schema
        df = as_pandas(cursor)
        print(df)
        # Save as a text file
        df.to_csv(path_or_buf='C:/Users/curycu/workspace/select.txt', index=False)
    except:
        print("Error getting list of tables.")
        print(traceback.format_exc())
    cursor.close()
except:
    print("Error establishing connection to Impala.")