def test_thrift_http_auth_none(self):
    rootdir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
    orig_http = os.path.join(rootdir, 'scripts', 'travis-conf', 'hive',
                             'hive-site-http-none.xml')
    orig_none = os.path.join(rootdir, 'scripts', 'travis-conf', 'hive',
                             'hive-site.xml')
    des = os.path.join('/', 'etc', 'hive', 'conf', 'hive-site.xml')
    try:
        subprocess.check_call(['sudo', 'cp', orig_http, des])
        _restart_hs2(10001)
        with contextlib.closing(
                hive.connect(host=_HOST, username='******',
                             thrift_transport_protocol='http',
                             auth='NONE', http_path='/')) as connection:
            with contextlib.closing(connection.cursor()) as cursor:
                cursor.execute('SELECT * FROM one_row')
                self.assertEqual(cursor.fetchall(), [(1,)])
        with contextlib.closing(
                hive.connect(host=_HOST,
                             thrift_transport_protocol='http')) as connection:
            with contextlib.closing(connection.cursor()) as cursor:
                cursor.execute('SELECT * FROM one_row')
                self.assertEqual(cursor.fetchall(), [(1,)])
    finally:
        subprocess.check_call(['sudo', 'cp', orig_none, des])
        _restart_hs2()

def test_ldap_connection(self):
    rootdir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
    orig_ldap = os.path.join(rootdir, 'scripts', 'travis-conf', 'hive',
                             'hive-site-ldap.xml')
    orig_none = os.path.join(rootdir, 'scripts', 'travis-conf', 'hive',
                             'hive-site.xml')
    des = os.path.join('/', 'etc', 'hive', 'conf', 'hive-site.xml')
    try:
        subprocess.check_call(['sudo', 'cp', orig_ldap, des])
        subprocess.check_call(['sudo', 'service', 'hive-server2', 'restart'])
        time.sleep(10)
        with contextlib.closing(
                hive.connect(host=_HOST, username='******',
                             auth='LDAP', password='******')) as connection:
            with contextlib.closing(connection.cursor()) as cursor:
                cursor.execute('SELECT * FROM one_row')
                self.assertEqual(cursor.fetchall(), [(1,)])
        self.assertRaisesRegexp(
            TTransportException, 'Error validating the login',
            lambda: hive.connect(host=_HOST, username='******',
                                 auth='LDAP', password='******'))
    finally:
        subprocess.check_call(['sudo', 'cp', orig_none, des])
        subprocess.check_call(['sudo', 'service', 'hive-server2', 'restart'])
        time.sleep(10)

def test_invalid_kerberos_config(self):
    """kerberos_service_name should be set if and only if using KERBEROS"""
    self.assertRaisesRegexp(
        ValueError, 'kerberos_service_name.*KERBEROS',
        lambda: hive.connect(_HOST, kerberos_service_name=''))
    self.assertRaisesRegexp(
        ValueError, 'kerberos_service_name.*KERBEROS',
        lambda: hive.connect(_HOST, auth='KERBEROS'))

def test_custom_connection(self):
    rootdir = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
    orig_custom = os.path.join(rootdir, 'scripts', 'travis-conf', 'hive',
                               'hive-site-custom.xml')
    orig_none = os.path.join(rootdir, 'scripts', 'travis-conf', 'hive',
                             'hive-site.xml')
    des = os.path.join('/', 'etc', 'hive', 'conf', 'hive-site.xml')
    try:
        subprocess.check_call(['sudo', 'cp', orig_custom, des])
        _restart_hs2()
        with contextlib.closing(
                hive.connect(host=_HOST, username='******',
                             auth='CUSTOM', password='******')) as connection:
            with contextlib.closing(connection.cursor()) as cursor:
                cursor.execute('SELECT * FROM one_row')
                self.assertEqual(cursor.fetchall(), [(1,)])
        self.assertRaisesRegexp(
            TTransportException, 'Error validating the login',
            lambda: hive.connect(host=_HOST, username='******',
                                 auth='CUSTOM', password='******'))
    finally:
        subprocess.check_call(['sudo', 'cp', orig_none, des])
        _restart_hs2()

def open(cls, connection):
    if connection.state == 'open':
        logger.debug('Connection is already open, skipping open.')
        return connection

    creds = connection.credentials
    connect_retries = creds.get('connect_retries', 0)
    connect_timeout = creds.get('connect_timeout', 10)

    exc = None
    for i in range(1 + connect_retries):
        try:
            if creds.method == 'http':
                cls.validate_creds(
                    creds, ['token', 'host', 'port', 'cluster', 'organization'])
                conn_url = SPARK_CONNECTION_URL.format(**creds)
                transport = THttpClient.THttpClient(conn_url)
                raw_token = "token:{}".format(creds.token).encode()
                token = base64.standard_b64encode(raw_token).decode()
                transport.setCustomHeaders(
                    {'Authorization': 'Basic {}'.format(token)})
                conn = hive.connect(thrift_transport=transport)
            elif creds.method == 'thrift':
                cls.validate_creds(creds, ['host'])
                conn = hive.connect(host=creds.host,
                                    port=creds.get('port'),
                                    username=creds.get('user'))
            break
        except Exception as e:
            exc = e
            if getattr(e, 'message', None) is None:
                raise
            message = e.message.lower()
            is_pending = 'pending' in message
            is_starting = 'temporarily_unavailable' in message
            warning = "Warning: {}\n\tRetrying in {} seconds ({} of {})"
            if is_pending or is_starting:
                logger.warning(warning.format(
                    e.message, connect_timeout, i + 1, connect_retries))
                time.sleep(connect_timeout)
            else:
                raise
    else:
        raise exc

    wrapped = ConnectionWrapper(conn)
    connection.state = 'open'
    connection.handle = wrapped
    return connection

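# A minimal standalone sketch of the token-over-HTTP transport pattern buried
# in the retry loop above. The conn_url and token are hypothetical
# placeholders, not values from the original code; pyhive accepts any prepared
# Thrift transport via the thrift_transport argument.
import base64
from pyhive import hive
from thrift.transport import THttpClient

def connect_with_token(conn_url, token):
    # HTTP endpoints that authenticate with an API token expect a standard
    # Basic header built from the "token:<token>" user/password pair
    transport = THttpClient.THttpClient(conn_url)
    encoded = base64.standard_b64encode(
        "token:{}".format(token).encode()).decode()
    transport.setCustomHeaders({'Authorization': 'Basic {}'.format(encoded)})
    return hive.connect(thrift_transport=transport)
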
def test_invalid_http_basic_auth(self):
    self.assertRaisesRegexp(
        ValueError, 'BASIC authentication requires password.',
        lambda: hive.connect(
            host=_HOST, thrift_transport_protocol='http', auth='BASIC'))
    self.assertRaisesRegexp(
        ValueError, 'BASIC authentication requires password.',
        lambda: hive.connect(host=_HOST, thrift_transport_protocol='http',
                             auth='BASIC', username='******'))

def execute(self, quals, columns):
    if self.query:
        statement = self.query
    else:
        statement = ("SELECT " + ",".join(self.columns.keys()) +
                     " FROM " + self.table)
    log_to_postgres('Hive query: ' + str(statement), DEBUG)
    try:
        client = hive.connect(self.host, username='******', port=self.port)
        cursor = client.cursor()
        cursor.execute(statement)
        for row in cursor.fetchall():
            line = {}
            for idx, column_name in enumerate(self.columns):
                line[column_name] = row[idx]
            yield line
    except NotImplementedError as ix:
        log_to_postgres(str(ix), ERROR)

def load_data(self, path, partition_val):
    """
    Loads the file at the path given on STDIN into Hive.

    Parameters
    ----------
    path: <string> path specified on STDIN
    partition_val: <list> partition values specified on STDIN

    Note
    ----
    The query to execute is hard-coded below; adjust it as needed.
    """
    partition_lst = []
    for x in range(len(self.partitions)):
        partition_lst.append("%s='%s'" % (self.partitions[x],
                                          partition_val[x]))
    partition_str = ','.join(partition_lst)
    query = "LOAD DATA INPATH '%s' \
             OVERWRITE INTO TABLE %s \
             PARTITION(%s)" % (path, self.table, partition_str)
    __LOG__.Trace(query)
    try:
        self.cursor = hive.connect(**self.conn_info).cursor()
        self.cursor.execute(query)
    except Exception as ex:
        __LOG__.Trace(ex)

def get_jd_data_hive():
    '''
    Data fetched from Hive comes back as a list; each row is a tuple and each
    cell is one element of that tuple, e.g.:
        2235063 <class 'list'> ('62456977355',)
    :return:
    '''
    try:
        idx = 0
        t0 = time.time()
        conn = hive.connect(host='172.20.207.6', port=10000,
                            username='******')
        # conn = connect(host='172.20.207.6', port=10000, auth_mechanism="PLAIN")
        cur = conn.cursor()
        sql = ("select spu_id from dim.dim_retailers_online_spu_sku "
               "where platform_type = 'jd'")
        cur.execute(sql)
        data = cur.fetchall()
        print("finish getting data!")
        ret_data = list_of_groups(data, 24000000)
        print("hive get data finish!")
        for ret_item in ret_data:
            idx += 1
            with open("./data/output_data_" + str(idx) + ".txt", "w",
                      encoding="utf-8") as f1:
                for item in ret_item:
                    f1.write(item[0] + "\n")
                f1.flush()
        cur.close()
        conn.close()
        t1 = time.time()
        return data
    except Exception as e:
        print(traceback.format_exc())

def hiveQuery(SQL):
    os.system("kinit bigf_admin -kt /etc/bigf.keytab")
    cursor = hive.connect(host=hiveHost, auth="KERBEROS",
                          kerberos_service_name="hive").cursor()
    cursor.execute("add jar /opt/cloudera/parcels/CDH/jars/"
                   "hive-contrib-1.1.0-cdh5.12.0.jar")
    cursor.execute(SQL)
    bb = cursor.fetchall()
    return bb

def connInstance(self, host=None, port=10000, username=None,
                 schema='default', auth=None, password=None):
    i = 0
    while i < TRY_CONNECT_TIMES:
        try:
            setattr(self, self.conn_key,
                    hive.connect(host=host, port=port, username=username,
                                 database=schema, auth=auth,
                                 password=password).cursor())
            break
        except Exception as e:
            output('hive Exception ' + str(e), logType='hive')
            j = 60 if i >= 4 else i * random.randint(1, 5)
            # If the connection keeps failing, send a warning but do not
            # terminate; keep trying to connect.
            if i == 10 or i == 50:
                _msg = "Can't connect hive %s@%s %s times" % (
                    self.conn_key[5:], _binname, i)
                output(_msg, logType='hive')
                notice_me(_msg)
            time.sleep(j)
            i += 1

def get_change_handle(table_id, table, target_database):
    if metadata.get_change_ddl(table_id):
        cursor = hive.connect(host=config.hiveserver2,
                              username=config.hive_user,
                              port=config.hive_port).cursor()
        cursor.execute(const.DROP_SQL % (target_database, table))
        cursor.close()

def create_mask_table(target_database, target_table, field, datatype,
                      partition_key, table_comment, field_comment):
    content = ""
    i = 0
    """
    Crudely use the string type for every column to avoid data-type conflicts.
    (For example, a bigint ID-card number written back into a bigint column
    after masking would come back as all NULLs when queried.)
    """
    for item in field:
        if i == 0:
            content += item + " string COMMENT '" + field_comment[i] + "'"
        else:
            content += "," + item + " string COMMENT '" + field_comment[i] + "'"
        i += 1
    partition = partition_key + " string"
    args = {
        "database": target_database,
        "table": target_table,
        "content": content,
        "partition": partition,
        "comment": table_comment
    }
    template = Template(mask_const.CREATE_MASK_TABLE)
    cursor = hive.connect(host=config.hiveserver2,
                          username=config.hive_user,
                          port=config.hive_port).cursor()
    cursor.execute(template.substitute(args))
    cursor.close()

def get_sku_10w(output_check_file, cat1_name, data_table):
    sample_check_list = []
    save_value_list = []
    conn = hive.connect(host='172.20.207.6', port=10000, username='******')
    cur = conn.cursor()
    new_dt = get_new_dt(cur, data_table, cat1_name)
    try:
        sql_str = """select * from %s a
            left join dwi.dwi_retailers_online_platform_info_pdd_10w c
              on c.sku_id = a.sku_id
            where a.dt = '%s' and c.sku_id is not null
              and a.cat1_name='%s'""" % (data_table, new_dt, cat1_name)
        cur.execute(sql_str)
        data_tuple = cur.fetchall()
        sample_check_list.append(data_tuple)
    except Exception as e:
        print(traceback.format_exc())
    for epoch in sample_check_list:
        for item in epoch:
            save_value_list.append(item)
    save_value_list.insert(0, ['sku_id', 'title', 'brand_std_id',
                               'brand_std_name', 'match_type_name',
                               'cat1_id', 'cat1_name', 'cat2_id', 'cat2_name',
                               'cat3_id', 'cat3_name'])
    writeExcel(output_check_file, save_value_list, '10w+商品')

def create_hive_parq_table():
    cursor = hive.connect('localhost').cursor()
    sql = '''
        create external table example_parq(one double, two string,
                                           three boolean)
        STORED AS PARQUET
        location 's3a://example-parquet/'
    '''
    cursor.execute(sql)

def hiveconnection(inSql):
    if kerberos_enabled:
        auth = "KERBEROS"
        kerberos_service_name = "hive"
        password = None
    else:
        password = api.config.password
        auth = 'CUSTOM'
        kerberos_service_name = None

    if http_enabled:
        conn = hive.connect(thrift_transport=add_http_mode_support())
    else:
        conn = hive.Connection(host=hostname, port=port, username=user,
                               password=password, database=database,
                               auth=auth,
                               kerberos_service_name=kerberos_service_name)
    cur = conn.cursor()
    cur.execute(inSql)
    resultList = cur.fetchall()
    string = ""
    for x in resultList:
        for y in x:
            # Delimiter to separate Hive columns in the output
            string = string + str(y) + api.config.delimiter
        string = string + "\n"
    api.send("output", string)

def get_conn(self, schema=None):
    """
    Returns a Hive connection object.
    """
    db = self.get_connection(self.hiveserver2_conn_id)  # pylint: disable=no-member
    auth_mechanism = db.extra_dejson.get('authMechanism', 'NONE')
    if auth_mechanism == 'NONE' and db.login is None:
        # we need to give a username
        username = '******'
    kerberos_service_name = None
    if conf.get('core', 'security') == 'kerberos':
        auth_mechanism = db.extra_dejson.get('authMechanism', 'KERBEROS')
        kerberos_service_name = db.extra_dejson.get('kerberos_service_name',
                                                    'hive')

    # pyhive uses GSSAPI instead of KERBEROS as an auth_mechanism identifier
    if auth_mechanism == 'GSSAPI':
        self.log.warning(
            "Detected deprecated 'GSSAPI' for authMechanism "
            "for %s. Please use 'KERBEROS' instead",
            self.hiveserver2_conn_id  # pylint: disable=no-member
        )
        auth_mechanism = 'KERBEROS'

    from pyhive.hive import connect
    return connect(
        host=db.host,
        port=db.port,
        auth=auth_mechanism,
        kerberos_service_name=kerberos_service_name,
        username=db.login or username,
        password=db.password,
        database=schema or db.schema or 'default')

def process_data(self, file_location):
    result = 'ok'
    cursor = None
    try:
        cursor = hive.connect(self.ip, port=self.port,
                              username=self.hive_user_name,
                              database=self.database).cursor()
        new_file_location = file_location + '.COMPLETE'
        shutil.move(file_location, new_file_location)
        table = self.get_table(file_location)
        assert table is not None
        LOAD_HSQL = "LOAD DATA LOCAL INPATH '%s' INTO TABLE %s" % (
            new_file_location, table)
        self.log.debug(LOAD_HSQL)
        cursor.execute(LOAD_HSQL)
    except Exception:
        self.log.warning(traceback.format_exc())
        result = 'Fail'
    finally:
        if cursor:
            cursor.close()
    if 'ok' == result:
        os.remove(new_file_location)
        self.log.info(new_file_location + ' is deleted.')
    return result

def test_invalid_transport(self):
    """transport and auth are incompatible"""
    socket = thrift.transport.TSocket.TSocket('localhost', 10000)
    transport = thrift.transport.TTransport.TBufferedTransport(socket)
    self.assertRaisesRegexp(
        ValueError, 'thrift_transport cannot be used with',
        lambda: hive.connect(_HOST, thrift_transport=transport))

def _get_connection(self):
    host = self.configuration['host']

    scheme = self.configuration.get('http_scheme', 'https')

    # if path is set but is missing the initial slash, prepend it
    path = self.configuration.get('http_path', '')
    if path and path[0] != '/':
        path = '/' + path

    # if port is set, prepend a colon
    port = self.configuration.get('port', '')
    if port:
        port = ':' + str(port)

    http_uri = "{}://{}{}{}".format(scheme, host, port, path)

    # create transport
    transport = THttpClient.THttpClient(http_uri)

    # if username or password is set, add an Authorization header
    # (Python 2: b64encode accepts str here; the next snippet is the
    # Python 3 port with explicit byte handling)
    username = self.configuration.get('username', '')
    password = self.configuration.get('http_password', '')
    if username or password:
        auth = base64.b64encode(username + ':' + password)
        transport.setCustomHeaders({'Authorization': 'Basic ' + auth})

    # create connection
    connection = hive.connect(thrift_transport=transport)

    return connection

def _get_connection(self):
    host = self.configuration["host"]

    scheme = self.configuration.get("http_scheme", "https")

    # if path is set but is missing the initial slash, prepend it
    path = self.configuration.get("http_path", "")
    if path and path[0] != "/":
        path = "/" + path

    # if port is set, prepend a colon
    port = self.configuration.get("port", "")
    if port:
        port = ":" + str(port)

    http_uri = "{}://{}{}{}".format(scheme, host, port, path)

    # create transport
    transport = THttpClient.THttpClient(http_uri)

    # if username or password is set, add an Authorization header;
    # base64 operates on bytes in Python 3, so encode before joining
    username = self.configuration.get("username", "")
    password = self.configuration.get("http_password", "")
    if username or password:
        auth = base64.b64encode(
            username.encode("ascii") + b":" + password.encode("ascii"))
        transport.setCustomHeaders(
            {"Authorization": "Basic " + auth.decode()})

    # create connection
    connection = hive.connect(thrift_transport=transport)

    return connection

def __init__(self, connection_string=None, username=None, password=None,
             proxy_user=None, impersonate=False, *args, **kwargs):
    with Timeout(120, "Timeout connecting to HiveServer"):
        connection_conf = get_hive_connection_conf(connection_string)

        port = 10000 if not connection_conf.port else connection_conf.port
        configuration = dict(connection_conf.configuration)
        configuration["mapred.job.queue.name"] = "root.dev-test"
        if proxy_user and impersonate:
            configuration["hive.server2.proxy.user"] = proxy_user
            configuration["mapred.job.queue.name"] = (
                "root.users.%s" % proxy_user)

        self._connection = hive.connect(
            host=connection_conf.host,
            port=port,
            database=connection_conf.default_db,
            auth="LDAP",
            username=username,
            password=password,
            configuration=configuration,
        )
    super(HiveClient, self).__init__()

def execute_query(self, query, data=None):
    """Run a SELECT statement.

    Args:
        query: The SELECT statement to be executed
        data [Optional]: The data to be used for a parametrized query

    Returns:
        The result as a list of dicts (or True for DML statements)
    """
    try:
        with contextlib.closing(
                hive.connect(host=self.__host,
                             username=self.__username)) as conn:
            with contextlib.closing(conn.cursor()) as cursor:
                cursor.execute(query, data)
                result = True
                # In the case of DML this is -1
                if cursor.rowcount != -1:
                    columns = cursor.description
                    result = [{columns[index][0]: column
                               for index, column in enumerate(value)}
                              for value in cursor.fetchall()]
    except Exception:
        raise
    return result

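# A short sketch of what the `data` argument above accepts: pyhive follows the
# DB-API "pyformat" paramstyle, so placeholders are %(name)s with a dict (or
# %s with a tuple) and the values are escaped client-side before the statement
# is sent. The host, table, and column here are illustrative placeholders.
import contextlib
from pyhive import hive

with contextlib.closing(hive.connect(host='localhost')) as conn:
    with contextlib.closing(conn.cursor()) as cursor:
        cursor.execute('SELECT * FROM one_row WHERE a = %(value)s',
                       {'value': 1})
        print(cursor.fetchall())
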
def pyhiveexesql(sql):
    print(sql)
    cursor = None
    try:
        cursor = hive.connect(host='', port=10000, username='').cursor()
        # async is a reserved word in Python 3.7+, so pyhive takes async_
        cursor.execute(sql, async_=True)
        status = cursor.poll().operationState
        while status in (TOperationState.INITIALIZED_STATE,
                         TOperationState.RUNNING_STATE):
            logs = cursor.fetch_logs()
            for message in logs:
                print(message)
            # If needed, an asynchronous query can be cancelled at any time:
            # cursor.cancel()
            status = cursor.poll().operationState
        # print(cursor.fetchall())
        print("Testing the Hive connection and printing the results!")
        conn_result = cursor.fetchall()
        for row in conn_result:
            print(row)
    except Exception as e:
        print('%s' % e)
    finally:
        if cursor:
            cursor.close()

def get_random_sample(output_file, cat1_name, data_table):
    '''
    Random sampling across all products.
    :param output_file:
    :return:
    '''
    conn = hive.connect(host='172.20.207.6', port=10000, username='******')
    cur = conn.cursor()
    new_dt = get_new_dt(cur, data_table, cat1_name)
    sku_count = get_table_count(cur, data_table, cat1_name)
    no_dict = {}
    sample_check_list = []
    save_value_list = []
    while True:
        tmp = random.randint(0, sku_count)
        if tmp not in no_dict:
            no_dict[tmp] = ''
        if len(no_dict) >= 3000:
            break
    r_lst = []
    for k, v in no_dict.items():
        r_lst.append(str(k))
    r_lst_tmp = ["'" + str(item) + "'" for item in r_lst]
    where_cond = "(" + ", ".join(r_lst_tmp) + ")"
    try:
        sql_str = """
            select x.sku_id, x.title, x.brand_std_id, x.brand_std_name,
                   x.cat1_std_id, x.cat1_std_name, x.cat2_std_id,
                   x.cat2_std_name, x.cat3_std_id, x.cat3_std_name
            from (
                select row_number() over(partition by 1) as rw_no,
                       sku_id, title, brand_std_id, brand_std_name,
                       cat1_std_id, cat1_std_name, cat2_std_id, cat2_std_name,
                       cat3_std_id, cat3_std_name
                from dwd.dwd_pdd_cat3_brand_reg
                where cat1_name = '%s' and dt = '%s'
            ) x
            where rw_no in %s""" % (cat1_name, new_dt, where_cond)
        cur.execute(sql_str)
        data = cur.fetchall()
        sample_check_list.append(data)
    except Exception as e:
        print(traceback.format_exc())
    for epoch in sample_check_list:
        for item in epoch:
            save_value_list.append(item)
    save_value_list.insert(0, ['sku_id', 'title', 'brand_std_id',
                               'brand_std_name', 'cat1_id', 'cat1_name',
                               'cat2_id', 'cat2_name', 'cat3_id', 'cat3_name'])
    writeExcel(output_file, save_value_list, '全局随机采样')

def create_hive_tables(bc, dir_data_lc, fileSchemaType, **kwargs):
    tables = kwargs.get("tables", tpchTables)
    for i, table in enumerate(tables):
        cursor = hive.connect("172.22.0.3").cursor()
        table = bc.create_table(table, cursor)
        # table = bc.create_table(table, cursor, file_format=fileSchemaType)
        print(table)

def connect_to_database(self, name, details=None):
    if details:
        self.registry[name] = details
    else:
        details = self.registry[name]

    dbtype = details["type"]
    creds = details["creds"]
    if dbtype == "mysql":
        import pymysql
        self.conns[name] = pymysql.connect(local_infile=True, **creds)
        self.uris[name] = (
            'mysql+pymysql://{user}:{password}@{host}:{port}/{db}'
            .format(**creds))
        self.engines[name] = create_engine(self.uris[name])
    elif dbtype in ["postgres", "redshift"]:
        import psycopg2
        self.conns[name] = psycopg2.connect(**creds)
    elif dbtype == "presto":
        from pyhive import presto
        self.conns[name] = presto.connect(**creds)
        self.uris[name] = (
            'presto://{username}@{host}:{port}/hive/default'.format(**creds))
        self.engines[name] = create_engine(self.uris[name])
        # engines[name] = create_engine('presto://', creator=lambda: conns[name])
    elif dbtype == "hive":
        from pyhive import hive
        self.conns[name] = hive.connect(**creds)

def pyhiveexesql(sql):
    print(sql)
    cursor = None
    try:
        cursor = hive.connect(host='', port=10000, username='').cursor()
        # async is a reserved word in Python 3.7+, so pyhive takes async_
        cursor.execute(sql, async_=True)
        status = cursor.poll().operationState
        while status in (TOperationState.INITIALIZED_STATE,
                         TOperationState.RUNNING_STATE):
            logs = cursor.fetch_logs()
            for message in logs:
                print(message)
            # If needed, an asynchronous query can be cancelled at any time:
            # cursor.cancel()
            status = cursor.poll().operationState
        # print(cursor.fetchall())
        print("Testing the Hive connection and printing the results!")
        conn_result = cursor.fetchall()
        for var in conn_result:
            # print(var)
            print("| Interface ID: %s, name: %s, attribute: %s, subject: %s, "
                  "upload method: %s, upload deadline: %s, status: %s |" %
                  (var[0], var[1], var[2], var[3], var[4], var[5], var[6]))
    except Exception as e:
        print('%s' % e)
    finally:
        if cursor:
            cursor.close()

def get_conn(self, schema=None):
    db = self.get_connection(self.hiveserver2_conn_id)
    auth_mechanism = db.extra_dejson.get('authMechanism', 'NONE')
    if auth_mechanism == 'NONE' and db.login is None:
        # we need to give a username
        username = '******'
    kerberos_service_name = None
    if configuration.conf.get('core', 'security') == 'kerberos':
        auth_mechanism = db.extra_dejson.get('authMechanism', 'KERBEROS')
        kerberos_service_name = db.extra_dejson.get('kerberos_service_name',
                                                    'hive')

    # pyhive uses GSSAPI instead of KERBEROS as an auth_mechanism identifier
    if auth_mechanism == 'GSSAPI':
        self.log.warning(
            "Detected deprecated 'GSSAPI' for authMechanism "
            "for %s. Please use 'KERBEROS' instead",
            self.hiveserver2_conn_id
        )
        auth_mechanism = 'KERBEROS'

    from pyhive.hive import connect
    return connect(
        host=db.host,
        port=db.port,
        auth=auth_mechanism,
        kerberos_service_name=kerberos_service_name,
        username=db.login or username,
        database=schema or db.schema or 'default')

def get_records(self, search_filter, column_filter, order_by, limit, offset):
    conn = None
    cursor = None
    try:
        conn = hive.connect(host=self.host, port=self.port,
                            database=self.database)
        logger.log_debug("Created connection")
        cursor = conn.cursor()
        fields, fields_types = self.__get_fields_types(
            cursor, self.table, column_filter)
        query = self.__get_query(column_filter=column_filter,
                                 table=self.table,
                                 search_filter=search_filter,
                                 order_by=order_by,
                                 limit=limit,
                                 offset=offset)
        logger.log_info("Executing query: " + query)
        cursor.execute(query)
        rows = cursor.fetchall()
        values = self.__get_values(rows=rows, fields=fields,
                                   fields_types=fields_types)
        return {"fields": fields, "values": values, "total_count": 1}
    except Exception as e:
        logger.log_error(
            "Failed while fetching data from database with error: " + str(e))
        raise e
    finally:
        if cursor:
            cursor.close()
        if conn:
            conn.close()

def test_invalid_transport_protocol(self):
    invalid_transport = 'invalid'
    self.assertRaisesRegexp(
        ValueError,
        'Invalid thrift_transport_protocol: {}'.format(invalid_transport),
        lambda: hive.connect(host=_HOST,
                             thrift_transport_protocol=invalid_transport))

def test_invalid_binary_auth(self):
    invalid_binary_auth = 'invalid'
    self.assertRaisesRegexp(
        NotImplementedError,
        'Only NONE, NOSASL, LDAP, KERBEROS, CUSTOM authentication are '
        'supported, got {}'.format(invalid_binary_auth),
        lambda: hive.connect(host=_HOST, auth=invalid_binary_auth))

def get_brand_topgmv_sample(focus_brand_file, output_check_file, cat1_name,
                            data_table):
    '''
    Top-GMV sampling within the key brands.
    :param focus_brand_file:
    :param output_check_file:
    :return:
    '''
    sample_check_list = []
    save_value_list = []
    seed_brand_dict = get_seed_brand_info(focus_brand_file)
    seed_brand_list = [(k, v) for k, v in seed_brand_dict.items()]
    sample_brand_list = seed_brand_list[:15]
    extra_brand_list = seed_brand_list[15:]
    sample_brand_list = sample_brand_list + random.sample(extra_brand_list, 35)
    conn = hive.connect(host='172.20.207.6', port=10000, username='******')
    cur = conn.cursor()
    new_dt = get_new_dt(cur, data_table, cat1_name)
    brand_id_tmp = ["'" + sample_item[0] + "'"
                    for sample_item in sample_brand_list]
    brand_id_str = '(' + ','.join(brand_id_tmp) + ')'
    try:
        sql1 = """
            select * from (
                select *, row_number() over(
                    partition by x.brand_std_id order by x.gmv desc) rn
                from (
                    select a.*, c.gmv
                    from %s a
                    left join (
                        SELECT sku_id, max(title) title,
                               sum(sale_amount) AS gmv
                        FROM dwi.dwi_retailers_online_platform_info
                        WHERE platform_type = 'pdd' AND dc = 'month'
                        group by sku_id
                    ) c on c.sku_id = a.sku_id
                    where a.dt = '%s' and c.gmv is not null
                      and a.brand_std_id in %s and a.cat1_name='%s'
                ) x
            ) d
            where rn <= 60""" % (data_table, new_dt, brand_id_str, cat1_name)
        cur.execute(sql1)
        data = cur.fetchall()
        sample_check_list.append(data)
    except Exception as e:
        print(traceback.format_exc())
    for epoch in sample_check_list:
        for item in epoch:
            save_value_list.append(item)
    save_value_list.insert(0, ['sku_id', 'title', 'brand_std_id',
                               'brand_std_name', 'match_type_name',
                               'cat1_id', 'cat1_name', 'cat2_id', 'cat2_name',
                               'cat3_id', 'cat3_name'])
    writeExcel(output_check_file, save_value_list, '重点品牌topGMV采样')

def db_query_date(city_name, date_value):
    cursor = hive.connect('localhost').cursor()
    query = (r"SELECT * FROM weather_data WHERE location LIKE '%" +
             city_name + "%' AND temp_dat LIKE '%" + date_value + "%'")
    cursor.execute(query)
    all_data = cursor.fetchall()
    df = pd.DataFrame([[ij for ij in i] for i in all_data])
    df.columns = ["location", "temp_date", "act_temp", "pred_temp"]
    datafr = df.reset_index().to_json(orient='records')
    return datafr

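# A hedged alternative to the concatenated LIKE query above: binding the
# patterns through pyhive's client-side parameter escaping avoids quoting bugs
# and SQL injection. Same assumed table and columns as the snippet above;
# db_query_date_safe is an illustrative name, not from the original code.
from pyhive import hive

def db_query_date_safe(city_name, date_value):
    cursor = hive.connect('localhost').cursor()
    # %s placeholders are bound to the tuple and escaped by pyhive, so the
    # '%' wildcards are supplied as part of the parameter values
    cursor.execute(
        "SELECT * FROM weather_data "
        "WHERE location LIKE %s AND temp_dat LIKE %s",
        ('%' + city_name + '%', '%' + date_value + '%'))
    return cursor.fetchall()
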
def _get_connection(self):
    host = self.configuration['host']

    connection = hive.connect(
        host=host,
        port=self.configuration.get('port', None),
        database=self.configuration.get('database', 'default'),
        username=self.configuration.get('username', None),
    )
    return connection

def _get_connection(self):
    host = self.configuration['host']

    # if path is set but is missing the initial slash, prepend it
    path = self.configuration.get('http_path', '')
    if path and path[0] != '/':
        path = '/' + path

    http_uri = "https://{}{}".format(host, path)

    transport = THttpClient.THttpClient(http_uri)

    # base64 operates on bytes in Python 3
    password = self.configuration.get('http_password', '')
    auth = base64.b64encode(('token:' + password).encode('ascii'))
    transport.setCustomHeaders({'Authorization': 'Basic ' + auth.decode()})

    connection = hive.connect(thrift_transport=transport)
    return connection

def hive_to_df(sql=""): """ execute sql in hive and return pandas DataFrame Args: sql: sql string Returns: pandas DataFrame """ print('connecting') cursor = hive.connect('localhost').cursor() print('query start') cursor.execute(sql) cont = cursor.fetchall() cols = cursor.description col_names = [j[0] for j in cols] tmp_data = pd.DataFrame(data=cont, columns=col_names) return tmp_data
def run_query(self, query):
    connection = None
    try:
        connection = hive.connect(**self.configuration.to_dict())

        cursor = connection.cursor()
        cursor.execute(query)

        column_names = []
        columns = []
        for column in cursor.description:
            column_name = column[COLUMN_NAME]
            column_names.append(column_name)
            columns.append({
                'name': column_name,
                'friendly_name': column_name,
                'type': types_map.get(column[COLUMN_TYPE], None)
            })

        rows = [dict(zip(column_names, row)) for row in cursor]
        data = {'columns': columns, 'rows': rows}
        json_data = json.dumps(data, cls=JSONEncoder)
        error = None
        cursor.close()
    except KeyboardInterrupt:
        connection.cancel()
        error = "Query cancelled by user."
        json_data = None
    except Exception as e:
        logging.exception(e)
        raise
    finally:
        if connection:
            connection.close()

    return json_data, error

def connect(self):
    return hive.connect(host=_HOST, username='******')

def run_query(self, query, user):
    connection = None
    try:
        host = self.configuration['host']

        if self.configuration.get('use_http', False):
            # default to https
            scheme = self.configuration.get('http_scheme', 'https')

            # if path is set but is missing the initial slash, prepend it
            path = self.configuration.get('http_path', '')
            if path and path[0] != '/':
                path = '/' + path

            # if port is set, prepend a colon
            port = self.configuration.get('port', '')
            if port:
                port = ':' + str(port)

            http_uri = "{}://{}{}{}".format(scheme, host, port, path)

            # create transport
            transport = THttpClient.THttpClient(http_uri)

            # if username or password is set, add an Authorization header
            username = self.configuration.get('username', '')
            password = self.configuration.get('http_password', '')
            if username or password:
                auth = base64.b64encode(username + ':' + password)
                transport.setCustomHeaders(
                    {'Authorization': 'Basic ' + auth})

            # create connection
            connection = hive.connect(thrift_transport=transport)
        else:
            connection = hive.connect(
                host=host,
                port=self.configuration.get('port', None),
                database=self.configuration.get('database', 'default'),
                username=self.configuration.get('username', None),
            )

        cursor = connection.cursor()
        cursor.execute(query)

        column_names = []
        columns = []
        for column in cursor.description:
            column_name = column[COLUMN_NAME]
            column_names.append(column_name)
            columns.append({
                'name': column_name,
                'friendly_name': column_name,
                'type': types_map.get(column[COLUMN_TYPE], None)
            })

        rows = [dict(zip(column_names, row)) for row in cursor]
        data = {'columns': columns, 'rows': rows}
        json_data = json.dumps(data, cls=JSONEncoder)
        error = None
    except KeyboardInterrupt:
        connection.cancel()
        error = "Query cancelled by user."
        json_data = None
    finally:
        if connection:
            connection.close()

    return json_data, error

def test_invalid_ldap_config(self):
    """password should be set if and only if using LDAP"""
    self.assertRaisesRegexp(ValueError, 'password.*LDAP',
                            lambda: hive.connect(_HOST, password=''))
    self.assertRaisesRegexp(ValueError, 'password.*LDAP',
                            lambda: hive.connect(_HOST, auth='LDAP'))

# -*- coding: utf-8 -*-
from pyhive import hive

host = '0.0.0.0'
port = 10000

hiveConn = hive.connect(host=host, port=port)
cursor = hiveConn.cursor()

query = 'select * from table'
cursor.execute(query)
aa = cursor.fetchall()
print(len(aa))

hiveConn.close()

def __init__(self, hostname='localhost', port=10000, schema='default',
             username='******', **kwargs):
    basic_conf = {'hive.cli.print.header': 'false'}
    self.conn = llap.connect(host=hostname, port=int(port),
                             username=username, database=schema,
                             configuration=basic_conf)

def connect(self):
    return hive.connect(host=_HOST,
                        configuration={'mapred.job.tracker': 'local'})