Code Example #1
File: Qrys.py | Project: P1R/RHPy
 def MaxTE(self, Cursor, Date, Limit=100):
     '''Compute the maximum price per row (TE report);
     requires a date and a row limit
     '''
     Qry = """
     select id_ofertas_energia_david,
     hora,
     central,
     precio_per_mw_1,
     precio_per_mw_2,
     precio_per_mw_3,
     precio_per_mw_4,
     precio_per_mw_5,
     precio_per_mw_6,
     precio_per_mw_7,
     precio_per_mw_8,
     precio_per_mw_9,
     precio_per_mw_10,
     precio_per_mw_11
     from ofertas_energia_david 
     where tipo_reporte='TE' and fecha_inicial='{0}' limit {1}
     """.format(Date, Limit)
     Cursor.execute(Qry)
     # convert the result set to a pandas DataFrame
     df = as_pandas(Cursor)
     # convert to an R data frame and pass it into R
     df = com.convert_to_r_dataframe(df)
     #print type(rdf)
     ro.r('source("./Rfunctions/max.R")')
     ro.globalenv['tabla'] = df
     ro.r('Out <- Rmax(tabla)')
     print ro.r('Out')
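
A note on the pandas-to-R hand-off above: com.convert_to_r_dataframe comes from the old pandas.rpy bridge, which no longer ships with pandas. A minimal sketch of the same conversion with rpy2 3.x (reusing the Rmax function and ./Rfunctions/max.R assumed by this example) might look like:

import rpy2.robjects as ro
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter

def call_rmax(df):
    # convert the pandas DataFrame to an R data.frame and call Rmax on it
    ro.r('source("./Rfunctions/max.R")')
    with localconverter(ro.default_converter + pandas2ri.converter):
        r_df = ro.conversion.py2rpy(df)
    ro.globalenv['tabla'] = r_df
    return ro.r('Rmax(tabla)')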
Code Example #2
 def test_zBizRule2(self):
     cs = self.icon.cursor()
     try:
         cs.execute(
             """ select max(order_date) as maxd from db.fct_total_order_trans """
         )
         df = as_pandas(cs)
         maxd = str(df["maxd"][0])
         lastyeard = TableTestCase.TableTestCase.GetLastYearSameDay(maxd)
         sql = """select a.store_key,a.t_amt/b.t_amt from 
             (select sum(real_amt) t_amt,store_key 
             from db.fct_total_order_trans where order_date='%s' and data_source='store' group by store_key) a 
             join 
             (select sum(real_amt) t_amt,store_key from db.fct_total_order_trans where order_date='%s' group by 
             store_key) b on a.store_key=b.store_key 
             where a.t_amt/b.t_amt is not null and (a.t_amt/b.t_amt>3 or a.t_amt/b.t_amt <0.3)""" % (
             maxd, lastyeard)
         cs.execute(sql)
         df = as_pandas(cs)
         rs = len(df)
         if rs > 20:
             self.raiseFailure("全渠道交易表:去年同日的销售量对比,发现差异过大的数据。<br />%s" %
                               df.to_html())
         if rs in range(1, 21):
             self.raiseError("全渠道交易表:去年同日的销售量对比,发现差异过大的数据。<br />%s" %
                             df.to_html())
     finally:
         cs.close()
Code Example #3
def api_payrate_zhexian():
    now_time, seven_days_time = get_now_and_7days_time()
    if 'provinceid' in request.args and 'cityid' in request.args and "areaid" in request.args and "shopid" not in request.args:
        sql=get_sql("sql","sql_payrate_zhexian1")\
            .replace("get_provinced_id",request.args['provinceid'])\
            .replace("get_city_id",request.args['cityid'])\
            .replace("get_area_id",request.args['areaid'])\
            .replace("seven_days_time","'"+seven_days_time+"'")\
            .replace("now_time","'"+now_time+"'")
        print (sql)
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array(df)
        return res
    if 'provinceid' in request.args and 'cityid' in request.args and "shopid" not in request.args:
        sql = get_sql("sql", "sql_payrate_zhexian2") \
            .replace("get_provinced_id", request.args['provinceid']) \
            .replace("get_city_id", request.args['cityid']) \
            .replace("seven_days_time", "'"+seven_days_time+"'") \
            .replace("now_time", "'"+now_time+"'")
        #print (sql)
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array(df)
        return res
    if 'provinceid' in request.args and "shopid" not in request.args:
        sql = get_sql("sql", "sql_payrate_zhexian3") \
            .replace("get_provinced_id", request.args['provinceid']) \
            .replace("seven_days_time", "'"+seven_days_time+"'") \
            .replace("now_time", "'"+now_time+"'")
        #print (sql)
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array(df)
        return res
    if 'shopid' in request.args :
        sql = get_sql("sql", "sql_payrate_zhexian4") \
            .replace("shop__", request.args['shopid']) \
            .replace("seven_days_time", "'" + seven_days_time + "'") \
            .replace("now_time", "'" + now_time + "'")
        #print (sql)
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array(df)
        return res
    else:
        sql = get_sql("sql", "sql_payrate_zhexian5") \
            .replace("seven_days_time", "'"+seven_days_time+"'") \
            .replace("now_time", "'"+now_time+"'")
        print (sql)
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array(df)
        return res
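
The handler above splices request arguments into the SQL text with str.replace. As a rough alternative, impyla's DB-API cursor accepts a parameters argument with pyformat placeholders, so a template written with %(name)s markers could be filled in by the driver instead. The helper below is only a sketch under that assumption; it reuses get_sql, trans_array and as_pandas from the example, and presumes sql_payrate_zhexian1 is rewritten with %(provinceid)s-style placeholders:

def payrate_zhexian_param(cur, args, now_time, seven_days_time):
    # assumes the SQL template uses %(provinceid)s, %(cityid)s, %(areaid)s,
    # %(seven_days)s and %(now)s placeholders instead of replace() markers
    params = {
        'provinceid': args.get('provinceid'),
        'cityid': args.get('cityid'),
        'areaid': args.get('areaid'),
        'seven_days': seven_days_time,
        'now': now_time,
    }
    sql = get_sql("sql", "sql_payrate_zhexian1")
    cur.execute(sql, params)
    return trans_array(as_pandas(cur))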
Code Example #4
def api_payrate_bing():
    yes_time = get_yes_time()
    if 'provinceid' in request.args and 'cityid' in request.args and "areaid" in request.args and "shopid" not in request.args:
        sql=get_sql("sql","sql_payrate1")\
            .replace("province_",request.args['provinceid'])\
            .replace("city_",request.args['cityid'])\
            .replace("area_",request.args['areaid']) \
            .replace("yes_time", "'" + yes_time + "'")
        #print (sql)
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array3(df)
        return res
    if 'provinceid' in request.args and 'cityid' in request.args and "shopid" not in request.args:
        sql = get_sql("sql", "sql_payrate2") \
            .replace("province_", request.args['provinceid']) \
            .replace("city_", request.args['cityid']) \
            .replace("yes_time", "'" + yes_time + "'")
        #print (sql)
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array3(df)
        return res
    if 'provinceid' in request.args and "shopid" not in request.args:
        sql = get_sql("sql", "sql_payrate3") \
            .replace("province_", request.args['provinceid']) \
            .replace("yes_time", "'" + yes_time + "'")
        #print (sql)
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array3(df)
        return res
    if 'shopid' in request.args :
        sql = get_sql("sql", "sql_payrate4") \
            .replace("shop__", request.args['shopid']) \
            .replace("yes_time", "'" + yes_time + "'")
        #print (sql)
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array3(df)
        return res

    else:
        sql = get_sql("sql", "sql_payrate5") \
            .replace("yes_time", "'" + yes_time + "'")
        #print (sql)
        cur.execute(sql)
        df = as_pandas(cur)
        res = trans_array3(df)
        return res
Code Example #5
File: MySQLFunctions.py | Project: Aziko13/DB_classes
def main():
    # Loading data from MySQL as an example
    conn = MySQLConnection()
    # in the query below table_name has to be replaced to some existing table
    query = '''
            select * from table_name limit 10
            '''
    conn.execute(query)

    res = as_pandas(conn.cur)
    print(res.head())

    res.loc[res.shape[0]] = [None, 'TEST']

    print('Table is loaded')

    # Loading Table into MySQL (another scheme)
    # in the query below schema_name has to be replaced to some existing schema
    conn.loadTableIntoMySQL(res,
                            table='schema_name.test_table',
                            replace=True,
                            print_q=False)

    # Deleting created table
    conn.execute('drop table if exists schema_name.test_table')

    print('Done')
Code Example #6
def ExecuteHiveSQL(sql, user, password, host, port):
    try:
        impala_logger = logging.getLogger('impala')

        #Enable Only CRITICAL Logger
        impala_logger.setLevel(logging.CRITICAL)

        ConnectionErrors = (Error, DatabaseError, InternalError,
                            OperationalError, ProgrammingError, IntegrityError,
                            DataError, NotSupportedError)

        # Establish Hive connection
        conn = getHiveConnection(user, password, host, int(port))
        cursor = conn.cursor()
        cursor.execute(sql)
        df_output = as_pandas(cursor)
        conn.close()

        return df_output
    except ConnectionErrors as e:
        logging.error("ExecuteHiveSQL(): Database Error - " + str(e))
        print(e)
        raise
    except Exception as e:
        logging.error("ExecuteHiveSQL(): Failed - " + str(e))
        print(e)
Code Example #7
def impala_query(sql):

    impala_HMS_HOST = os.getenv('IMPALA_HOST', 'url')
    impala = connect(host=impala_HMS_HOST,
                     port=21050,
                     use_ssl=False,
                     auth_mechanism='GSSAPI',
                     kerberos_service_name='impala')

    proc_start = time.time()
    # Time Check
    #--------------------------
    impala_cursor = impala.cursor()
    impala_cursor.execute(sql)
    df = as_pandas(impala_cursor)
    #--------------------------
    proc_end = time.time()

    columns = df.shape[1]
    nrows = len(df)

    # Calculation
    #--------------------------
    memory_usage = round(sum(df.memory_usage(deep=True)) / 1024 ** 2, 6)
    read_time = round(proc_end - proc_start, 6)

    impala.close()
    return df
Code Example #8
def run_sql(sql, username, password):
    conn = connect(host='10.2.8.96',
                   auth_mechanism='PLAIN',
                   port=21050,
                   user=username,
                   password=password)
    cursor = conn.cursor()
    cnt = 1

    if ';' in sql:
        sql_list = sql.rstrip().split(';')
        # print(type(sql_list))
        if len(sql_list[-1]):
            for s in sql_list:
                print("running sql @ %s" % cnt)
                cursor.execute(s)
                cnt += 1
        else:
            sql_list.pop()
            for s in sql_list:
                # logging.info('executing sql' + s)
                print("running sql @ %s" % cnt)
                cursor.execute(s)
                cnt += 1
    else:
        print("running sql @ %s" % cnt)
        cursor.execute(sql)

    return as_pandas(cursor) if cursor.description is not None else 'null'
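
Splitting on every ';' as above breaks if a statement contains a semicolon inside a string literal. For the simple multi-statement case it targets, a slightly tighter sketch that also skips empty fragments could look like this:

def run_statements(cursor, sql):
    # naive split: assumes no semicolons appear inside string literals or comments
    statements = [s.strip() for s in sql.split(';') if s.strip()]
    for i, stmt in enumerate(statements, start=1):
        print("running sql @ %s" % i)
        cursor.execute(stmt)
    return as_pandas(cursor) if cursor.description is not None else None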
Code Example #9
File: OracleFunctions.py | Project: Aziko13/DB_classes
def main():
    conn = OracleConnection()
    # in the query below table_name has to be replaced to some existing table
    query = '''
            select * from table_name where rownum <= 5
            '''
    conn.execute(query)

    res = as_pandas(conn.cur)
    print(res.head(3))
    print('Table is here')

    res.loc[res.shape[0]] = [None, 1223, 2232, 3232, 'TEST']

    # in the query below schema_name has to be replaced to some existing schema
    conn.loadTableIntoOracle(res,
                             table='schema_name.test_table',
                             replace=True,
                             print_q=False)

    # Deleting created table

    conn.execute('drop table schema_name.test_table')
    conn.execute('commit')

    print('Done')
Code Example #10
 def run_query(self, query):
     with dbclient(pyodbc_connect(self.dc, self.database)) as cursor:
         res = cursor.execute(query)
         getlogger().debug(
             '\n>>>>>>>>>>>>>>>>type of cursor.execute is {}'.format(
                 type(res)))
         return as_pandas(res)
Code Example #11
def hive_query(sql):

    HIVE_HMS_HOST = os.getenv('HIVE_HS2_HOST', 'url')
    hive = connect(host=HIVE_HMS_HOST,
                   port=10000,
                   use_ssl=False,
                   auth_mechanism='GSSAPI',
                   kerberos_service_name='hive')

    proc_start = time.time()
    # Time Check
    #--------------------------
    hive_cursor = hive.cursor()
    hive_cursor.execute(sql)
    df = as_pandas(hive_cursor)
    #--------------------------
    proc_end = time.time()

    columns = df.shape[1]
    nrows = len(df)

    # Calculation
    #--------------------------
    memory_usage = round(sum(df.memory_usage(deep=True)) / 1024 ** 2, 6)
    read_time = round(proc_end - proc_start, 6)

    hive.close()
    return df
Code Example #12
def taonewslabel():
    pipe = rds.pipeline()
    #key_list = rds.keys('*')
    #for key in key_list:
    #    if "taonewslabel_" in key:
    #        pipe.delete(key)
    #pipe.execute()
    #pipe = rds.pipeline()
    #print "taonewslabel  delete"
    cur.execute(
        "select userid,GROUP_CONCAT(cast(type as string)) as tags  from reports.taonew_label_user where logday='%s' group by userid;"
        % yestoday)
    df = as_pandas(cur)
    dict1 = dict()
    dict1['error_code'] = 0
    for index, row in df.iterrows():
        dict1['userid'] = row['userid']
        list = row['tags'].split(',')
        list = map(int, list)
        dict1['tags'] = list
        json_str = json.dumps(dict1)
        pipe.set('taonewslabel_' + row['userid'], json_str)
        if index % 1000 == 0:
            pipe.execute()
    pipe.execute()
    print "taonewslabel finished."
Code Example #13
def run_sql(sql):
    conn = connect(host='172.17.69.25',
                   auth_mechanism='PLAIN',
                   port=21050,
                   user='******',
                   password='******')
    cursor = conn.cursor()
    cnt = 1

    # 2.2 Run the SQL statement(s) against Impala
    if ';' in sql:
        sql_list = sql.rstrip().split(';')
        # print(type(sql_list))
        if len(sql_list[-1]):
            for s in sql_list:
                print("running sql @ %s" % cnt)
                cursor.execute(s)
                cnt += 1
        else:
            sql_list.pop()
            for s in sql_list:
                print("running sql @ %s" % cnt)
                cursor.execute(s)
                cnt += 1
    else:
        print("running sql @ %s" % cnt)
        cursor.execute(sql)
    # 2.3 Convert the result set to a pandas DataFrame for analysis
    # df = as_pandas(cursor)
    # print(df)
    return as_pandas(cursor) if cursor.description is not None else 'null'
Code Example #14
File: impala_sql.py | Project: zhangzhenhu/prophet
def aistudy_knowledge_question(param=None):
    if param is None:
        param = {'year': '2018', 'term_id': '2', 'subject_id': '', 'grade_id': '7', 'city_id': '020'}
    sql = """
        select 
          -- ad.cuc_num,
          -- cla.knowledge_id,
          -- cla.knowledge_name,ad
          ad.problem_id as item_id,
          ad.difficulty as difficulty,
          ad.discrimination,
          ad.new_difficulty
          -- ad.isdel
          --cla.id,
          -- cla.create_time
          -- rank() over(partition by cla.knowledge_id,cla.knowledge_name order by cla.id) as order_num
        from odata.ods_ai_study_ad_paper_question ad
       --join  odata.ods_ai_study_ad_cla_knowledge cla on ad.knowledge_id=cla.knowledge_id

    """ % param
    today = datetime.today().strftime("%Y-%m-%d")
    file_name = "item_profile_%s.pkl" % today
    if os.path.exists(file_name):
        return pd.read_pickle(file_name)
    impala_cursor.execute(sql)
    df = as_pandas(impala_cursor)
    df.to_pickle(file_name)
    return df
Code Example #15
File: Qrys.py | Project: P1R/RHPy
 def MaxMonth(self, Cursor, Month, Units, Limit=1):
     '''get maximum of month by units'''
     File=open('MaxMonth.csv','w')
     File.write("unidad, fecha, hora, costo_maximo\n")
     for item in Units:
         Qry = """
         select * from(
         select unidad,
         fecha_inicial,
         hora,precio_per_mw_10,
         max(precio_per_mw_10) over
         (partition by unidad order by fecha_inicial,
         hora asc rows between unbounded preceding and unbounded following )
         as costo_maximo from ofertas_energia_david
         where (tipo_reporte='TE' and unidad='{1}' 
         and month(fecha_inicial)={0} and precio_per_mw_10 > 0)
         ) data where data.costo_maximo=precio_per_mw_10 
         and data.costo_maximo > 0 limit {2}
         """.format(Month, item, Limit)
         Cursor.execute(Qry)
         # convert the result set to a pandas DataFrame
         df = as_pandas(Cursor)
         print df
         try:
             File.write("{0}, {1}, {2}, {3}\n".format(str(df['data.unidad'][0]),
                 str(df['data.fecha_inicial'][0]),
                 str(df['data.hora'][0]),
                 str(df['data.costo_maximo'][0])))
         except IndexError:
             print item + " vacio"
             pass
     File.close()
Code Example #16
def impala_query(sql):
    conn = connect(**impala_config)
    cur = conn.cursor()
    cur.execute(sql)
    df = as_pandas(cur)
    conn.close()
    return df
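
The helper above leaks the connection if execute or as_pandas raises. A variant using contextlib.closing (with the same impala_config dict assumed by the example) closes both the cursor and the connection on any exit path:

from contextlib import closing
from impala.dbapi import connect
from impala.util import as_pandas

def impala_query_safe(sql):
    # connection and cursor are closed even if the query fails
    with closing(connect(**impala_config)) as conn:
        with closing(conn.cursor()) as cur:
            cur.execute(sql)
            return as_pandas(cur)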
Code Example #17
 def test(self):
     """测试"""
     cursor = self.conn.cursor()
     cursor.execute('show tables like "dm*_sdr_*dm"')
     df = as_pandas(cursor)
     data = df.to_dict('list')
     print(data)
Code Example #18
def impala_connect(sql, **kwargs):
    # impala
    host = kwargs.get("host", 'impala.bjds.belle.lan')
    port = kwargs.get("port", 21051)
    timeout = kwargs.get("timeout", 3600)
    # hive
    # host = kwargs.get("host", 'impala.bjds.belle.lan')
    # port = kwargs.get("port", 10008)
    # timeout = kwargs.get("timeout", 3600)
    user = kwargs.get("user", "lv.d.sz")
    password = kwargs.get("password", 'JHjLXpyQ')
    kerberos_service_name = kwargs.get("kerberos_service_name", "impala")
    conn = connect(host=host,
                   port=port,
                   timeout=timeout,
                   user=user,
                   password=password,
                   kerberos_service_name=kerberos_service_name,
                   auth_mechanism='LDAP')
    cur = conn.cursor(user=user)
    df = None
    if sql is not None:
        cur.execute(sql)
        try:
            df = as_pandas(cur)
        except:
            return cur
    return df
Code Example #19
def getData(sql):
    '''Purpose: fetch the feature data for the given date'''
    con = connect(**INCEPTOR_CONFIG)
    cur = con.cursor()
    cur.execute(sql)
    df_data = as_pandas(cur)
    cur.close()
    return df_data
Code Example #20
File: extract_flag.py | Project: mazhen2010/ml
def impala_db(hive_sql):
    conn = connect(host=get_config("impala", "host"),
                   port=get_config("impala", "port"),
                   database=get_config("impala", "database"),
                   auth_mechanism='PLAIN')
    curl = conn.cursor()
    curl.execute(hive_sql)
    return as_pandas(curl)
Code Example #21
    def read_df(self, statement):

        self.create_connect()
        self.cursor.execute(statement)
        df = as_pandas(self.cursor)
        self.close_connect()

        return df
Code Example #22
def report():

    cursor = get_hive_cursor()

    if cursor is None:
        return render_template('/main/bi_connection_issue.html')

    # FIXME we probably want to create aggregates on hadoop
    #       and cache them rather than returning the whole data
    #       set here

    # we need to ignore monitoring pings which have rating user_id = -1 
    # and movie_id = -1
    try:
        cursor.execute(
            "select * from movie_ratings where customer_id <> '-1' and movie_id <> '-1'", 
            configuration={ 
                'hive.mapred.supports.subdirectories': 'true', 
                'mapred.input.dir.recursive': 'true' 
                })
    except:
        return render_template('/main/bi_connection_issue.html')

    df = as_pandas(cursor)
    
    count = df.shape[0]

    if count == 0:
       return render_template('/main/bi_no_records.html')

    from bokeh.charts import Bar, output_file, show

    fig = Bar(
            df,
            label='movie_ratings.rating',
            values='movie_ratings.rating',
            agg='count',
            title='Distribution of movie ratings',
            legend=False
            )


    fig.plot_height = 400
    fig.xaxis.axis_label = 'Rating'
    fig.yaxis.axis_label = 'Count ( Rating )'

    js_resources = INLINE.render_js()
    css_resources = INLINE.render_css()

    script, div = components(fig)
    html = flask.render_template(
        '/main/embed.html',
        plot_script=script,
        plot_div=div,
        js_resources=js_resources,
        css_resources=css_resources,
    )
    return encode_utf8(html)
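
bokeh.charts (the source of Bar above) was removed from Bokeh years ago. A rough, version-dependent equivalent of the ratings histogram with bokeh.plotting might look like the sketch below (the sizing keyword is plot_height in older Bokeh releases and height in newer ones); components(fig) then works as before. The 'movie_ratings.rating' column name is taken from the example.

from bokeh.plotting import figure

counts = df['movie_ratings.rating'].value_counts().sort_index()
categories = [str(r) for r in counts.index]
fig = figure(title='Distribution of movie ratings',
             x_range=categories, height=400)
fig.vbar(x=categories, top=counts.values, width=0.8)
fig.xaxis.axis_label = 'Rating'
fig.yaxis.axis_label = 'Count ( Rating )'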
Code Example #23
File: db_impala.py | Project: mantoujiaozi/Learning
 def select_sql_df(self, sql):
     """
     Run a query and return a DataFrame
     :param sql:
     :return:
     """
     self.cur.execute(sql)
     df = as_pandas(self.cur)
     return df
Code Example #24
File: HiveClient.py | Project: hanle1/xmtpcb
def dhive():
    # s ='create external table aoi_test(panelId string,sn string,lineCode string,stationCode string,deviceId string,result string,AOI_errorCode map<string,string>, AOI_info map<string,string>,AOI_tbTag string,AOI_singleTag string,AOI_workOrder string,AOI_data string,AOI_time string,AOI_time_cnt_num int)  ROW FORMAT SERDE \'org.openx.data.jsonserde.JsonSerDe\' STORED AS TEXTFILE location \'/pcb/AOI/\';'
    from impala.dbapi import connect
    from impala.util import as_pandas
    conn = connect(host='10.141.212.26', port=10000, database='pcb',auth_mechanism='PLAIN')
    cur = conn.cursor()
    cur.execute('select * from aoi WHERE AOI_time_cnt_num = 1')
    df = as_pandas(cur)
    print(df)
Code Example #25
File: hive_read_write.py | Project: penglee87/lpython
 def read_to_dataframe(self, sql):
     try:
         from impala.util import as_pandas
         self.cursor = self.conn.cursor()
         self.cursor.execute(sql)
         self.df = as_pandas(self.cursor)
     except:
         raise
     return(self.df)
Code Example #26
def select_dataframe(sql):
    """
    Return a pandas data frame with the results of a sql query to impala
    :param sql:
    :return:
    """
    cur = impala_conn.cursor()
    cur.execute(sql)
    return as_pandas(cur)
Code Example #27
File: utils.py | Project: linpelvis/arthas
def _get_normalization_data(sql):
    cur = impala_cli.cursor(sql, True)
    df = as_pandas(cur)

    ret = {}
    ret['time'] = df['c_date'].tolist()
    ret['datas'] = __parse_data(df)
    logger.info('_get_normalization_data data:%s', ret)
    return ret
Code Example #28
    def test_zBizRule6(self):
        cs = self.icon.cursor()
        try:
            cs.execute(
                """ select distinct order_date from db.fct_total_order_trans where data_source = 'bs' 
                           and order_date >='2018-09-01'""")
            df = as_pandas(cs)
            rs = df["order_date"].tolist()
            breakdates = TableTestCase.TableTestCase.ConsecutivenessDateTest(
                rs, "%Y-%m-%d")
            if len(breakdates) > 0:
                self.raiseFailure("全渠道交易表官网数据存在中断,在%s和%s之间 " %
                                  (breakdates[0], breakdates[1]))

            cs.execute(
                """select distinct order_date from db.fct_total_order_trans where data_source = 'tmall'"""
            )
            df = as_pandas(cs)
            rs = df["order_date"].tolist()
            breakdates = TableTestCase.TableTestCase.ConsecutivenessDateTest(
                rs, "%Y-%m-%d")
            if len(breakdates) > 0:
                self.raiseFailure("全渠道交易表Tmall数据存在中断,在%s和%s之间 " %
                                  (breakdates[0], breakdates[1]))

            cs.execute(
                """select distinct order_date from db.fct_total_order_trans where data_source like '%o2o%'
                          and order_date >='2016-09-01'""")
            df = as_pandas(cs)
            rs = df["order_date"].tolist()
            breakdates = TableTestCase.TableTestCase.ConsecutivenessDateTest(
                rs, "%Y-%m-%d")
            if len(breakdates) == 0:
                pass
            # Tmall switch on 2018/5/16
            elif len(breakdates) in (2, 3) and str(breakdates[0]).strip() == '2018-05-15 00:00:00' \
                    and str(breakdates[1]).strip() == '2018-05-17 00:00:00':
                pass
            else:
                self.raiseFailure("全渠道交易表O2O数据存在中断,在 %s " %
                                  ','.join([str(s) for s in breakdates]))
        finally:
            cs.close()
Code Example #29
 def hive2dataframe(sql_context):
     conn = connect(host='192.168.1.73',
                    port=10000,
                    auth_mechanism='PLAIN',
                    user='******',
                    password='******',
                    database='ods')
     cursor = conn.cursor()
     cursor.execute(sql_context)
     return as_pandas(cursor)
Code Example #30
def monitor():

    auth = request.authorization
    if not auth or not check_auth(auth.username, auth.password):
        data = { "error": "Permission denied." }
        response = app.response_class(
            response=json.dumps(data),
            status=550,
            mimetype='application/json'
        )
        return response

    cursor = get_hive_cursor()

    if cursor is None:
        data = { "error": "Could not connect to Hive" }
        response = app.response_class(
            response=json.dumps(data),
            status=500,
            mimetype='application/json'
        )
        return response

    timestamp = time.time()

    message = '{0},{1},{2}'.format(-1, -1, timestamp)
    messagehub_client.send_message( message ) 

    time.sleep(70)

    cursor.execute(
            'select * from movie_ratings where rating = {0}'.format(timestamp), 
            configuration={ 
                'hive.mapred.supports.subdirectories': 'true', 
                'mapred.input.dir.recursive': 'true' 
                })
    df = as_pandas(cursor)
    count = df.shape[0]

    if count == 1:
        data = { "ok": "App rating found in hadoop." }
        response = app.response_class(
            response=json.dumps(data),
            status=200,
            mimetype='application/json'
        )
        return response
    else:
        data = { "error": "App rating not found in hadoop." }
        response = app.response_class(
            response=json.dumps(data),
            status=500,
            mimetype='application/json'
        )
        return response
Code Example #31
def impala_run_and_upload(query_string, metric_name):
    """
    Connects to Avvo production cluster and retrieves dataframe
    :param str query_string: an impala query to be run
    :rtype pd.DataFrame
    :return: dataframe with results
    """
    if not isinstance(query_string, str):
        raise TypeError('query_string must be a string!')
    conn = connect(host='dn1wow.prod.avvo.com',
                   port=21050,
                   database='tmp_data_dm',
                   auth_mechanism="GSSAPI",
                   kerberos_service_name='impala')
    cur = conn.cursor()
    cur.execute(query_string)
    output_df = as_pandas(cur)

    drop_table = 'drop table if exists rd_%s_temp' % metric_name
    cur.execute(drop_table)

    # for finding the date column
    headers = output_df.head()
    matching = [s for s in headers if 'date' in s]
    date_column = matching[0]

    print 'creating %s temp table \n' % metric_name
    write_query_string = "create table rd_%s_temp as %s;" % (metric_name,
                                                             query_string)
    cur.execute(write_query_string)

    time.sleep(2)
    # create the main table if it doesn't exist
    print 'creating %s table \n' % metric_name
    create_table = 'CREATE TABLE if not exists rd_%s as ( select * from rd_%s_temp order by 1)' \
                   % (metric_name, metric_name)
    cur.execute(create_table)

    time.sleep(2)
    print 'updating %s table \n' % metric_name
    query_string = "with new_data as (select * from rd_%s where \
                    %s not in (select %s from rd_%s_temp)) \
                    insert into rd_%s_temp select * from new_data order by %s" % (
        metric_name, date_column, date_column, metric_name, metric_name,
        date_column)

    cur.execute(query_string)
    time.sleep(2)
    replace_query = "with new_data as (select * from rd_%s_temp) \
                    insert overwrite rd_%s select * from new_data order by %s" % (
        metric_name, metric_name, date_column)
    cur.execute(replace_query)

    conn.close()
    return output_df
Code Example #32
File: impselect.py | Project: genichyar/impselect
    def __execute(self, sql, ret='pandas'):
        ret_val = None
        with connect(**self.connection) as con:
            cur = con.cursor()
            cur.execute(sql)
            if ret == 'pandas':
                ret_val = as_pandas(cur)
            elif ret == 'status':
                ret_val = cur.status()

        return ret_val
Code Example #33
File: impala_cli.py | Project: linpelvis/arthas
def main():
    HOST='127.0.0.1'
    PORT=21050
    SQL = 'select * from xxx limit 1'
    try:
        impala_cli = Impala(HOST, PORT)
        cur = impala_cli.cursor(SQL, True)
        df = as_pandas(cur)
        print df.test.describe()
    except Exception as e:
        print e.message
Code Example #34
 def test_zBizRule9(self):
     cs = self.icon.cursor()
     try:
         cs.execute(
             """ select count(*) ct from db.fct_total_order_trans where customer_key is null """
         )
         df = as_pandas(cs)
         ct = int(df["ct"][0])
         if ct > 0:
             self.raiseFailure("全渠道交易表:存在空的customer_key")
     finally:
         cs.close()
Code Example #35
def run_sql(sql, user, pswd):
    conn = connect(host='10.2.8.91',
                   auth_mechanism='PLAIN',
                   port=21050,
                   user=user,
                   password=pswd)
    cursor = conn.cursor()

    cursor.execute(sql)
    # df = as_pandas(cursor)
    # print(df)
    return as_pandas(cursor) if cursor.description is not None else 'null'
Code Example #36
File: flow_oa.py | Project: cgiraldo/incubator-spot
    def _ingest_summary(self):
        # get date parameters.
        yr = self._date[:4]
        mn = self._date[4:6]
        dy = self._date[6:]

        self._logger.info("Getting ingest summary data for the day")
        
        ingest_summary_cols = ["date","total"]		
        result_rows = []        
        df_filtered =  pd.DataFrame()

        # get ingest summary.

        query_to_load=("""
                SELECT tryear, trmonth, trday, trhour, trminute, COUNT(*) as total
                FROM {0}.{1} WHERE y={2} AND m={3} AND d={4}
                AND unix_tstamp IS NOT NULL
                AND sip IS NOT NULL
                AND sport IS NOT NULL
                AND dip IS NOT NULL
                AND dport IS NOT NULL
                AND ibyt IS NOT NULL
                AND ipkt IS NOT NULL
                AND tryear={2}
                AND cast(treceived as timestamp) IS NOT NULL
                GROUP BY tryear, trmonth, trday, trhour, trminute;
        """).format(self._db,self._table_name, yr, mn, dy)
        
        results = impala.execute_query(query_to_load) 
 
        if results:
            df_results = as_pandas(results) 
            
            #Forms a new dataframe splitting the minutes from the time column
            df_new = pd.DataFrame([["{0}-{1}-{2} {3}:{4}".format(val['tryear'],val['trmonth'],val['trday'], val['trhour'], val['trminute']), int(val['total']) if not math.isnan(val['total']) else 0 ] for key,val in df_results.iterrows()],columns = ingest_summary_cols)
            value_string = ''
            #Groups the data by minute 

            sf = df_new.groupby(by=['date'])['total'].sum()
            df_per_min = pd.DataFrame({'date':sf.index, 'total':sf.values})
            
            df_final = df_filtered.append(df_per_min, ignore_index=True).to_records(False,False) 
            if len(df_final) > 0:
                query_to_insert=("""
                    INSERT INTO {0}.flow_ingest_summary PARTITION (y={1}, m={2}, d={3}) VALUES {4};
                """).format(self._db, yr, mn, dy, tuple(df_final))

                impala.execute_query(query_to_insert)
                
        else:
            self._logger.info("No data found for the ingest summary")
Code Example #37
File: bdf.py | Project: fkaufer/impyla
    def take(self, n):
        """Return `n` rows as a pandas `DataFrame`

        Distributed and no notion of order, so not guaranteed to be
        reproducible.
        """
        alias = _random_id('inline_', 4)
        table_ref = InlineView(self._query_ast.to_sql(), alias)
        select_list = [SelectItem(table_name=TableName(table_ref.name))] # SELECT alias.*
        limit_elt = LimitElement(Literal(n), None)
        ast = SelectStmt(select_list, table_ref, limit=limit_elt)
        bdf = BigDataFrame(self._ic, ast)
        return as_pandas(bdf.__iter__())
Code Example #38
def desc_total_sales_volumn(year):
    
    # Redis read cache value
    REDIS_KEY = "desc_total_sales_vol:{0}".format(year)
    cached_data = redis_io.read_transaction(REDIS_KEY)
    
    if cached_data != None:
        return ast.literal_eval(cached_data)
    #
    
    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()

    # daily transaction agg
    cur.execute('USE salest')
    
    bIsRealTimeUpdated = REFRESH_IMPALA_TABLE(cur, year)
        
    cur.execute("""
        SELECT year_month_day, SUM(num_of_product) AS num_of_product, SUM(sales_amount) AS total_amount
        FROM (
            SELECT SUBSTR(date_receipt_num,1,10) AS year_month_day, num_of_product, sales_amount
            FROM """ + GET_IMPALA_TB_NAME(year) + """ WHERE SUBSTR(date_receipt_num,1,4) = '""" + year +
        """'
        ) view_tr_recipt
        GROUP BY year_month_day ORDER BY year_month_day ASC
        """
    )

    df_tr_agg_daily = as_pandas(cur)
    conn.close()
    
    series_sum = df_tr_agg_daily[['num_of_product','total_amount']].sum()
    series_sum.name = 'sum'

    df_desc = df_tr_agg_daily.describe().append(series_sum)
    df_desc['num_of_product'] = df_desc['num_of_product'].apply(lambda v: round(v))
    df_desc['total_amount'] = df_desc['total_amount'].apply(lambda v: round(v))
    
    df_desc.fillna(0, inplace=True)
    
    cached_data = df_desc.to_dict()
    
    if bIsRealTimeUpdated == False:
        # Redis save cache value
        redis_io.write_transaction(REDIS_KEY, cached_data)
        #
        cached_data = redis_io.read_transaction(REDIS_KEY)
        cached_data = ast.literal_eval(cached_data)
    
    return cached_data
Code Example #39
def get_timebase_data_on_past_specific_date(cur_date):
    
    # Redis read cache value
    REDIS_KEY_PREFIX = "past_timebase_data_of"
    
    def get_cache_value(cur_date):
        return redis_io.read_dict_transaction(REDIS_KEY_PREFIX + ":" + cur_date) 
    
    cached_data = get_cache_value(cur_date)
    if cached_data != None:
        return cached_data
    
    
    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()

    cur.execute('USE salest')
    
    date_list = tuple(get_past_target_date(cur_date))

    cur.execute(
    """
        SELECT time_hour, CAST(SUM(sales_amount) as INTEGER) AS total_amount, 
        COUNT(sales_amount) as num_of_transaction,
        COUNT(DISTINCT year_month_day) as date_count
        FROM(
            SELECT SUBSTR(date_receipt_num,1,10) AS year_month_day,
            SUBSTR(tr_time,1,2) AS time_hour,
            sales_amount
            FROM ext_tr_receipt WHERE SUBSTR(date_receipt_num,1,10) IN ('%s')
            """ % date_list +
            """
        ) view_tr_total_amount_by_dayofweek
        GROUP BY time_hour ORDER BY time_hour ASC
        """
    )
    df_by_hour = as_pandas(cur)
    conn.close()
    
    df_by_hour.set_index('time_hour',inplace=True)
    df_by_hour = df_by_hour.reindex([[str(i) for i in np.arange(10,24)]],fill_value=0)

    dict_result = df_by_hour['total_amount'].to_dict()
    dict_result['date'] = date_list[0]
     
    redis_io.write_dict_transaction(REDIS_KEY_PREFIX + ":" + cur_date, dict_result, 60*60)
    
    ret_dict = get_cache_value(cur_date)

    return ret_dict
Code Example #40
File: Qrys.py | Project: P1R/RHPy
 def AvgMonth(self, Cursor, Month, Units, Limit=1):
     '''Get average of month by units'''
     File=open('AvgMonth.csv', 'w')
     File.write("Unidad, promedio\n")
     for item in Units:
         Qry = """
         select avg(precio_per_mw_10) 
         from ofertas_energia_david
         where tipo_reporte='TE' and unidad='{1}' 
         and month(fecha_inicial)={0} and precio_per_mw_10 > 0
         limit {2}
         """.format(Month, item,Limit)
         Cursor.execute(Qry)
         # convert the result set to a pandas DataFrame
         df = as_pandas(Cursor)
         File.write("{0}, {1}\n".format(item,str(df['c0'][0])))
     File.close()
Code Example #41
def get_product_data(product_name):
    
    # Redis read cache value
    REDIS_KEY_PREFIX = "popular_product_info"
    
    def get_cache_value(product_name):
        cache_data = redis_io.read_dict_transaction(REDIS_KEY_PREFIX + ":" + product_name, ['product_code','price'])
        if cache_data == None:
            return None
    
        dict_data = {}
        dict_data['product_code'] = cache_data[0]
        dict_data['price'] = cache_data[1]
        return dict_data
    

    cached_data = get_cache_value(product_name)
    if cached_data != None:
        return cached_data
    
    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()
    cur.execute('USE salest')
  
    queryStr = """SELECT product_name,price,product_code FROM ext_menumap_info"""
    
    cur.execute(queryStr)
    df_categories = as_pandas(cur)

    df_categories = df_categories[df_categories.price != 0]
    #df_categories.set_index('product_name', inplace=True)
    
    for idx,row in df_categories.iterrows():
        key = "{0}:{1}".format(REDIS_KEY_PREFIX,row.product_name)
        value = row[['product_code','price']].to_dict()
        redis_io.write_dict_transaction(key, value, 60*60*24*30)
    
    cached_data = redis_io.read_dict_transaction(REDIS_KEY_PREFIX + ":" + product_name, ['product_code','price'])
    return get_cache_value(product_name)
Code Example #42
File: bdf.py | Project: fkaufer/impyla
 def collect(self):
     """Return the BDF data to the client as a pandas DataFrame"""
     return as_pandas(self.__iter__())
Code Example #43
def analysis_timebase_sales_amount(year, day_of_week):
    
    # Redis read cache value
    REDIS_KEY = "timebase_sales_amount:{0}:{1}".format(year,day_of_week)
    cached_data = redis_io.read_transaction(REDIS_KEY)
    
    if cached_data != None:
        return ast.literal_eval(cached_data)
    #
    
    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()

    cur.execute('USE salest')
    
    bIsRealTimeUpdated = REFRESH_IMPALA_TABLE(cur, year)
        
    start_date = "%s/01/01" % year
    end_date = "%s/12/31" % year
    
    if(day_of_week=='All'):
        target_date_idx = pd.date_range(start_date,end_date)
    else:
        target_date_idx = pd.date_range(start_date,end_date, freq=day_of_week)
        
    target_date_arr = target_date_idx.strftime('%Y-%m-%d')
    target_date_tuple = tuple(target_date_arr)

    cur.execute(
        """
        SELECT time_hour, CAST(SUM(sales_amount) as INTEGER) AS total_amount, 
        COUNT(sales_amount) as num_of_transaction,
        COUNT(DISTINCT year_month_day) as date_count
        FROM(
            SELECT SUBSTR(date_receipt_num,1,10) AS year_month_day,
            SUBSTR(tr_time,1,2) AS time_hour,
            sales_amount
            FROM """ + GET_IMPALA_TB_NAME(year) + """ WHERE SUBSTR(date_receipt_num,1,10) IN %s
            """ % (target_date_tuple,) +
            """
        ) view_tr_total_amount_by_dayofweek
        GROUP BY time_hour ORDER BY time_hour ASC
        """
    )
    df_by_weekofday = as_pandas(cur)
    conn.close()
        
    def calc_average_amount(row):
        return row.total_amount / row.date_count

    df_by_weekofday['total_amount'] = df_by_weekofday.apply(calc_average_amount,axis=1)
    df_by_weekofday.set_index('time_hour',inplace=True)
    
    cached_data = df_by_weekofday.to_dict()
    
    if bIsRealTimeUpdated == False:
        # Redis save cache value
        redis_io.write_transaction(REDIS_KEY, cached_data)
        #
    
    return cached_data
Code Example #44
def agg_montly_total_amount_by_product_cate(year):
    
    # Redis read cache value
    REDIS_KEY = "monthly_total_amount_per_cate:{0}".format(year)
    cached_data = redis_io.read_transaction(REDIS_KEY)
    
    if cached_data != None:
        return ast.literal_eval(cached_data)
    
    
    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()

    cur.execute('USE salest')
    
    bIsRealTimeUpdated = REFRESH_IMPALA_TABLE(cur, year)
        
    cur.execute(
    """
        SELECT SUBSTR(view_tr_receipt.date_receipt_num,1,7) AS year_month, 
              view_tr_receipt.num_of_product, view_tr_receipt.sales_amount AS total_amount,
            ext_menumap_info.product_name, ext_menumap_info.cate_name, ext_menumap_info.price
        FROM (SELECT * FROM """ + GET_IMPALA_TB_NAME(year) + """ WHERE SUBSTR(date_receipt_num,1,4) = '""" + year + "'" +
    """) view_tr_receipt JOIN ext_menumap_info USING (product_code)"""
    )

    df_tr_receipt_menumap = as_pandas(cur)
    conn.close()
    
    def aggregation(row):
        total_amount = row['total_amount'].sum()
        return pd.Series([total_amount], index=['total_amount'])
    
    df_monthly_product_tr = df_tr_receipt_menumap.groupby(['year_month','cate_name']).apply(aggregation)
    
    df_default = genDefaultMontlyCateTotalAmountDataFrame(df_monthly_product_tr,year, 'cate_name')
    df_all_monatly_sales_volume = pd.merge(df_default, df_monthly_product_tr, left_index=True, right_index=True, how='outer').fillna(0).sort_index(ascending='1')

    def post_aggregation(row):
        return row['total_amount_x'] + row['total_amount_y']
    
    df_all_monatly_sales_volume['total_amount'] = df_all_monatly_sales_volume.apply(post_aggregation, axis=1)
    df_all_monatly_sales_volume.drop(['total_amount_x','total_amount_y'], axis=1, inplace=True)

    def gen_dict_total_amount(month_rows):
        monthlyDict = {}
        monthlyDictKey = month_rows.index.get_level_values('year_month')[0]
        
        monthCateItemsStr = "{"
        for item in zip(month_rows.index.get_level_values('cate_name'),month_rows['total_amount']):
            monthCateItemsStr += "'{0}':{1},".format(item[0],item[1]);
        
        monthCateItemsStr = monthCateItemsStr[:-1]
        monthCateItemsStr += "}"
        
        monthlyDict = ast.literal_eval(monthCateItemsStr)
        monthlyDict['year_month'] = month_rows.index.get_level_values('year_month')[0]

        return monthlyDict
    
    mothlyTotalAmountDictItems = df_all_monatly_sales_volume.groupby(df_all_monatly_sales_volume.index.get_level_values('year_month')).apply(gen_dict_total_amount)    
 
    mothlyTotalAmountDict = {}
    mothlyTotalAmountList = []
    for item in mothlyTotalAmountDictItems:
        mothlyTotalAmountList.append(item)
    mothlyTotalAmountDict['total_amount'] = mothlyTotalAmountList

    if bIsRealTimeUpdated == False:
        # Redis save cache value
        redis_io.write_transaction(REDIS_KEY, mothlyTotalAmountDict)
        #
    
    return mothlyTotalAmountDict
Code Example #45
ptb_mothers = open('PTB1v4_mothers.txt', 'r').readlines()
ptb_list = map(str.strip, ptb_mothers)

##################
## create queries ##
##################
from impala.util import as_pandas
ptb_query = ("SELECT * "
       "FROM p7_ptb.illumina_variant as ill, public_hg19.ensembl_genes as ens "
       "WHERE ens.gene_name IN ('" + "','".join(gene_list) + "') " +
       "AND ill.sample_id IN ('" + "','".join(ptb_list) + "') " +
       "AND ens.chromosome = ill.chr AND (ill.pos >= ens.start AND ill.pos <= ens.stop)"
       )

cur.execute(ptb_query)
ptb_results = as_pandas(cur)
ptb_results.to_csv("./ptb_variants_Ill.tsv", sep="\t")

ptb_query = ("SELECT * "
                    "FROM p7_ptb.comgen_variant as com, public_hg19.ensembl_genes as ens "
                    "WHERE ens.gene_name IN ('" + "','".join(gene_list) + "') " +
                    "AND com.sample_id IN ('" + "','".join(ptb_list) + "') " +
                    "AND ens.chromosome = com.start AND (com.start >= ens.start AND com.start<= ens.stop)"
                    )

cur.execute(ptb_query)
ptb_results = as_pandas(cur)
ptb_results.to_csv("./ptb_variants_com.tsv", sep="\t")


ftb_query = ("SELECT * "
Code Example #46
def queryEvents(eventLevels,whereClauses,fromRootThresholdPct,fromParentThresholdPct,appName,startEvent,startDate,endDate):
    print "This will take a few minutes"
    vID = 0
    levelID = 1
    if len(whereClauses) > 0:
        where = 'true '
        for eachClause in whereClauses:
            where = where + "and " + eachClause

    #Query Impala
    query = """select distinct
                   0 as vID,
                   0 as levelID,
                   0 as parentVID,
                   event_name,
                   distinct_id
                   from
                   fact_beacon_history
                   where
                   %s
                   and event_name = '%s'
                   and app_name = '%s'
                   and date_sid between '%s' and '%s'
                   """ % (where, startEvent, appName, startDate, endDate)
    conn = connect(host='52.89.99.148', port=21050)
    cur = conn.cursor()
    t0 = datetime.datetime.utcnow()
    cur.execute(query)
    t1 = datetime.datetime.utcnow()
    #print t1-t0
    treeData = as_pandas(cur)
    rootUserCount = len(treeData['distinct_id'])
    vID = 1
    # parentDistinctIDs = pd.DataFrame(pd.unique(treeData['distinct_id'])).values

    while levelID < eventLevels:
        uniques = treeData[treeData['levelid'] == levelID - 1].drop_duplicates(subset=['event_name','vid'],take_last=True)[['event_name','vid']].values
        eventList = treeData[treeData['levelid'] == levelID - 1].drop_duplicates(subset=['event_name','vid'],take_last=True)[['event_name','vid']].values
        for eachEvent, pvid in eventList:
            #Query Impala
            query = """select distinct
                       event_name
                       from
                       fact_beacon_history
                       where
                       %s
                       and app_name = '%s'
                       and prev_event = '%s'
                       and date_sid between '%s' and '%s'""" % (where, appName, eachEvent, startDate, endDate)
            conn = connect(host='52.89.99.148', port=21050)
            cur = conn.cursor()
            t0 = datetime.datetime.utcnow()
            cur.execute(query)
            t1 = datetime.datetime.utcnow()
            #print t1-t0
            events = as_pandas(cur)
            parentVID = pvid
            #parentVID = treeData[(treeData['levelid'] == levelID - 1) & (treeData['event_name'] == eachEvent)]['vid'].values[0]

            for eachSubEvent in events[events['event_name'] != startEvent]['event_name']:
                parentUserCount = len(treeData[(treeData['levelid'] == levelID - 1) & (treeData['event_name'] == eachEvent)])
                parentRootConv = np.true_divide(parentUserCount,rootUserCount)*100
                if parentRootConv > fromRootThresholdPct:
                    query = """select distinct
                               %s as vID,
                               %s as levelID,
                               %s as parentVID,
                               event_name,
                               distinct_id
                               from
                               fact_beacon_history
                               where
                               %s
                               and app_name = '%s'
                               and event_name = '%s'
                               and prev_event = '%s'
                               and date_sid between '%s' and '%s'
                              """ % (vID, levelID, parentVID, where, appName, eachSubEvent, eachEvent, startDate, endDate)
                    conn = connect(host='52.89.99.148', port=21050)
                    cur = conn.cursor()
                    t0 = datetime.datetime.utcnow()
                    cur.execute(query)
                    t1 = datetime.datetime.utcnow()
                    addTreeData = as_pandas(cur)
                    #parentDistinctIDs = treeData[(treeData['levelid'] == levelID - 1) & (treeData['event_name'] == eachEvent)]['distinct_id'].values
                    parentDistinctIDs = treeData[treeData['vid'] == parentVID].values
                    addRecord = False
                    for eachID in addTreeData['distinct_id']:
                        if eachID in parentDistinctIDs:
                            treeData = pd.concat([treeData,addTreeData[addTreeData['distinct_id'] ==  eachID]])
                            addRecord = True
                    tempGraphData = pd.DataFrame({'user_count' : treeData.groupby(['vid','levelid','parentvid','event_name'], as_index = False).size()}).reset_index()
                    nodeUserCount = tempGraphData[tempGraphData['vid'] == max(tempGraphData['vid'])]['user_count']
                    fromParentConv = np.true_divide(nodeUserCount,parentUserCount).values*100
                    if addRecord == True and fromParentConv > fromParentThresholdPct:
                        vID +=1
                    elif addRecord == True:
                        treeData = treeData[treeData['vid'] != max(treeData['vid'])]
        levelID += 1
    #print treeData
    return treeData
Code Example #47
File: bdf.py | Project: cgc17/impyla
 def head(self, row_count = 5):
     head_query = ('SELECT * FROM (%s) AS head_tbl LIMIT %d' %
                    (self._query_ast.to_sql(), row_count))
     self._ic._cursor.execute(head_query)
     return as_pandas(self._ic._cursor)
Code Example #48
def get_most_popular_products(req_cate_name):

        
    # Redis read cache value
    REDIS_KEY = "product_category_info"
    
    def get_cache_value(req_cate_name):
        subKeyList = []
        subKeyList.append(req_cate_name)
        return redis_io.read_dict_transaction(REDIS_KEY, subKeyList)

        
    # Redis read cache value
    #REDIS_KEY = "product_category_info"
    #cached_data = redis_io.read_dict_transaction(REDIS_KEY,req_cate_name)

    #if cached_data != None:
    #    return ast.literal_eval(cached_data[0])
    cached_data = get_cache_value(req_cate_name)
    if cached_data != None:
        return ast.literal_eval(cached_data[0])
    
    
    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()
    cur.execute('USE salest')

    dict_product_cate_items = {}

    # Category Items

    queryStr = """
            SELECT DISTINCT cate_name FROM ext_menumap_info
            """

    cur.execute(queryStr)
    df_categories = as_pandas(cur)
    dict_product_cate_items['All'] = df_categories['cate_name'].values.tolist()


    # Top 10 most popular items per category

    for cate_name in dict_product_cate_items['All']:
        query_str = """
            SELECT product_name, SUM(sales_amount) AS total_amount
            FROM
            (
                SELECT cate_name,product_name,date_receipt_num,sales_amount
                FROM 
                    (SELECT * FROM ext_menumap_info WHERE cate_name = '""" + cate_name + """' ) view_specific_menu JOIN ext_tr_receipt USING (product_code)
            ) view_tr_specific_cate_menu
            GROUP BY (view_tr_specific_cate_menu.product_name)
            ORDER BY (SUM(sales_amount)) DESC
            LIMIT 10
            """
        cur.execute(query_str)
  
        df_papular_products = as_pandas(cur)
        df_papular_products = df_papular_products[df_papular_products.total_amount != 0]
        dict_product_cate_items[cate_name] = df_papular_products['product_name'].values.tolist()

    conn.close()

    # Redis save cache value
    redis_io.write_dict_transaction(REDIS_KEY, dict_product_cate_items, 60*60*24*30)
    #
    cached_data = get_cache_value(req_cate_name)
    return ast.literal_eval(cached_data[0])
Code Example #49
def agg_montly_sales_volumn(year,unit_numofproduct, unit_totalamount):
    
    # Redis read cache value
    REDIS_KEY = "monthly_sales_vol:{0}".format(year)
    cached_data = redis_io.read_transaction(REDIS_KEY)
    
    if cached_data != None:
        return  ast.literal_eval(cached_data)
    #
    
    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()

    cur.execute('USE salest')
    
    bIsRealTimeUpdated = REFRESH_IMPALA_TABLE(cur, year)
    
    cur.execute("""
        SELECT year_month, SUM(num_of_product) AS num_of_product, SUM(sales_amount) AS total_amount
        FROM (
            SELECT SUBSTR(date_receipt_num,1,7) AS year_month, num_of_product, sales_amount
            FROM """ + GET_IMPALA_TB_NAME(year) + """ WHERE SUBSTR(date_receipt_num,1,4) = '""" + year +
        """'
        ) view_tr_recipt
        GROUP BY year_month ORDER BY year_month ASC
        """
    )
    df = as_pandas(cur)
    conn.close()
    
    ### Fill non-included monthly row with zero base values.
    month_index_arr = []

    for month in range(1,13):
        month_index_arr.append("{0}-{1:02d}".format(year,month))
    
    df_base_index = pd.DataFrame(data=month_index_arr, columns=['year_month'])
    df_all_monatly_sales_volume = pd.merge(df, df_base_index, on='year_month',how='outer').fillna(0).sort_values(by='year_month',ascending='1')
    ###

    df_list = list(df_all_monatly_sales_volume.itertuples(index=False))
    df_column_name_list = list(df.columns.values)

    list_month_sales_volume = []
    dict_month_sales_volume = {}

    for row in df_list:
        dict_month_sales_volume = {}
        
        for key,value in zip(df_column_name_list, row):
            if(key=='num_of_product'):
                value = int(round(value / unit_numofproduct))
            if(key=='total_amount'):
                value = int(round(value / unit_totalamount))
            dict_month_sales_volume[key] = value
        
        list_month_sales_volume.append(dict_month_sales_volume.copy())

    if bIsRealTimeUpdated == False:
        # Redis save cache value
        redis_io.write_transaction(REDIS_KEY, list_month_sales_volume)
        #
    
    return list_month_sales_volume
Code Example #50
def agg_montly_total_amount_by_product(year, product_cate):
    
    # Redis read cache value
    REDIS_KEY = "monthly_total_amount_per_product:{0}:{1}".format(year,product_cate.encode("UTF-8"))
    cached_data = redis_io.read_transaction(REDIS_KEY)
    
    if cached_data != None:
        return ast.literal_eval(cached_data)
    #
    
    conn = connect(host='salest-master-server', port=21050)
    cur = conn.cursor()
    
    cur.execute('USE salest')
    
    bIsRealTimeUpdated = REFRESH_IMPALA_TABLE(cur, year)
    
    query_str = """
        SELECT * FROM (
            SELECT SUBSTR(view_tr_receipt.date_receipt_num,1,7) AS year_month, 
                  view_tr_receipt.num_of_product, view_tr_receipt.sales_amount AS total_amount,
                  ext_menumap_info.product_name, ext_menumap_info.cate_name, ext_menumap_info.price
            FROM (SELECT * FROM """ + GET_IMPALA_TB_NAME(year) + """ WHERE SUBSTR(date_receipt_num,1,4) = '%s'
            ) view_tr_receipt JOIN ext_menumap_info USING (product_code)
        ) view_tr_receipt_menumap
        WHERE cate_name = '%s'
        """  % (year,product_cate)
        
    cur.execute(query_str.encode("UTF-8"))
    
    df_monthly_product_tr = as_pandas(cur)
    conn.close()
        
    column_func_tuple = [('total_amount','sum')]
    df_monthly_summary = df_monthly_product_tr.groupby(['year_month','product_name'])['total_amount'].agg(column_func_tuple)
    df_monthly_summary.rename(columns={'total_amount': 'total_amount_B'}, inplace=True)

    df_default = genDefaultMontlyCateTotalAmountDataFrame(df_monthly_summary,year, 'product_name')
    df_default.rename(columns={'total_amount': 'total_amount_A'}, inplace=True)

    df_per_category = pd.concat([df_default, df_monthly_summary], axis=1).fillna(0)

    def post_aggregation(row):
        return row[0] + row[1]
    
    df_per_category['total_amount'] = df_per_category.apply(post_aggregation, axis=1)
    df_per_category.drop(['total_amount_A','total_amount_B'],axis=1,inplace=True)
  
     # Overall Top 10 menu items in category 
    
    df_topten_products_by_total_amount = df_monthly_product_tr.groupby(['product_name']).sum().sort_values(by='total_amount', ascending=False)[:10]
    df_topten_products_by_total_amount.drop(['num_of_product'],axis=1, inplace=True)
    df_topten_products_by_total_amount.rename(columns={'total_amount':'overall_total_amount'},inplace=True)

    # Redis save cache value
    redis_io.write_transaction(product_cate, df_topten_products_by_total_amount.index.tolist(), 60*60*24*7)
    #
    
    # Merge the above two dataframes
    df_new = df_per_category.reset_index(level=0)
    df_merged = pd.merge(df_new, df_topten_products_by_total_amount, left_index=True, right_index=True, how='left').sort_values(by='year_month', ascending=True)
    
    def agg_monthly_items_summary(row):
        sr_columns = row[row['overall_total_amount'].notnull()].index
        sr_values = row[row['overall_total_amount'].notnull()]['total_amount']

        etcSum = row[row['overall_total_amount'].isnull()]['total_amount'].sum()

        sr_columns = sr_columns.insert(sr_columns.size,'ETC')
        sr_etc = pd.Series([etcSum], index=['ETC'])
        sr_values = sr_values.append(sr_etc)

        return pd.Series(sr_values, index=sr_columns)
    
    df_merged_new = df_merged.reset_index(level=0)

    df_agg_monthly_summary = df_merged.groupby(['year_month']).apply(agg_monthly_items_summary)#.unstack()
    df_agg_monthly_summary.fillna(0,inplace=True)
    
    monthlyDictItems = df_agg_monthly_summary.apply(gen_dict_total_amount,axis=1)
    
    mothlyTotalAmountDict = {}
    mothlyTotalAmountList = []
    for item in monthlyDictItems:
        mothlyTotalAmountList.append(item)
    mothlyTotalAmountDict['total_amount'] = mothlyTotalAmountList
    
    if bIsRealTimeUpdated == False:
        # Redis save cache value
        redis_io.write_transaction(REDIS_KEY, mothlyTotalAmountDict)
        #
    
    return mothlyTotalAmountDict
Code Example #51
from impala.dbapi import connect
from impala.util import as_pandas
import pandas as pd

conn = connect(host='ec2-54-86-98-154.compute-1.amazonaws.com', port=21050)
cur = conn.cursor()

# Describe Kaviar
cur.execute('use p7_ref_grch37')
cur.execute('describe kaviar')
print("Describing Kaviar fields...")
print('\n'.join(' - '.join(elems) for elems in cur.fetchall()))

# Getting Kaviar Data
# this query retrieves chrom 8 p arm 0-23100000
print("Selecting data from kaviar for chromosome 8 pos 0-23100000")
cur.execute('select chrom, pos, allele_freq from kaviar where chrom = "8"')
#cur.execute('select chrom, pos, allele_freq from kaviar where chrom = "8" and pos >= 0 and pos <= 23100000')
kaviar = as_pandas(cur)  # this will take a relatively long time
kaviar["chrom"] = 'chr' + kaviar["chrom"].astype(str)  # need to add chr to the front to match the ideogram and ucsc_genes
kaviar.to_csv("kaviar_results.txt")
print("Finished saving kaviar data")

kaviar["start"] = kaviar["pos"]
kaviar["end"] = kaviar["pos"]
kaviar["width"] = kaviar.end - kaviar.start + 10
kaviar['colors'] = '#43a822'  # green

# bin kaviar data
binned_kaviar = pd.DataFrame(columns=('chrom', 'start', 'end', 'avg', 'max_af', 'min_af'))
for chrom, group in kaviar.groupby('chrom'):
    width = group.shape[0]
    binsize = 10000
    num_bins = width / binsize
Code Example #52
from impala.dbapi import connect
from impala.util import as_pandas
import traceback

select = "SELECT * FROM tab LIMIT 100"

try:
    conn = connect(host='your_host', port=21050)
    cursor = conn.cursor()
    try:
        cursor.execute(select)
        print cursor.description  # prints the result set's schema

        df = as_pandas(cursor)
        print df
        
        df.to_csv(path_or_buf='C:/Users/curycu/workspace/select.txt', index=False) # save as text file
    except:
        print "Error getting list of tables."
        print traceback.format_exc()
    cursor.close()
except:
    print "Error establishing connection to Impala."