Beispiel #1
0
    def test_thrift_http_auth_none(self):
        """Query HiveServer2 over the HTTP transport with ``auth='NONE'``.

        The HTTP/NONE ``hive-site.xml`` is copied over the live configuration
        and the server is restarted before connecting, once with explicit
        arguments and once relying on the defaults.  The default configuration
        is always restored afterwards.
        """
        repo_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        conf_dir = os.path.join(repo_root, 'scripts', 'travis-conf', 'hive')
        http_site = os.path.join(conf_dir, 'hive-site-http-none.xml')
        default_site = os.path.join(conf_dir, 'hive-site.xml')
        live_site = os.path.join('/', 'etc', 'hive', 'conf', 'hive-site.xml')

        # First fully explicit, then with everything but the protocol defaulted.
        connect_variants = (
            dict(host=_HOST,
                 username='******',
                 thrift_transport_protocol='http',
                 auth='NONE',
                 http_path='/'),
            dict(host=_HOST, thrift_transport_protocol='http'),
        )

        try:
            subprocess.check_call(['sudo', 'cp', http_site, live_site])
            # presumably 10001 is the HTTP port to wait for - confirm in _restart_hs2
            _restart_hs2(10001)

            for connect_kwargs in connect_variants:
                with contextlib.closing(
                        hive.connect(**connect_kwargs)) as connection:
                    with contextlib.closing(connection.cursor()) as cursor:
                        cursor.execute('SELECT * FROM one_row')
                        self.assertEqual(cursor.fetchall(), [(1, )])
        finally:
            # Put the stock configuration back even when the test failed.
            subprocess.check_call(['sudo', 'cp', default_site, live_site])
            _restart_hs2()
Beispiel #2
0
    def test_ldap_connection(self):
        """Check LDAP authentication against a reconfigured HiveServer2.

        The LDAP ``hive-site.xml`` replaces the live configuration and the
        service is restarted; a query is run with one set of credentials and
        a second connect call is expected to fail validation.  The default
        configuration is restored in all cases.
        """
        repo_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        conf_dir = os.path.join(repo_root, 'scripts', 'travis-conf', 'hive')
        ldap_site = os.path.join(conf_dir, 'hive-site-ldap.xml')
        default_site = os.path.join(conf_dir, 'hive-site.xml')
        live_site = os.path.join('/', 'etc', 'hive', 'conf', 'hive-site.xml')
        restart_cmd = ['sudo', 'service', 'hive-server2', 'restart']

        try:
            subprocess.check_call(['sudo', 'cp', ldap_site, live_site])
            subprocess.check_call(restart_cmd)
            time.sleep(10)  # fixed pause after the restart before connecting

            ldap_conn = hive.connect(
                host=_HOST, username='******', auth='LDAP', password='******')
            with contextlib.closing(ldap_conn) as connection:
                with contextlib.closing(connection.cursor()) as cursor:
                    cursor.execute('SELECT * FROM one_row')
                    self.assertEqual(cursor.fetchall(), [(1,)])

            with self.assertRaisesRegexp(TTransportException,
                                         'Error validating the login'):
                hive.connect(
                    host=_HOST, username='******', auth='LDAP', password='******')
        finally:
            # Restore the stock configuration even when the test failed.
            subprocess.check_call(['sudo', 'cp', default_site, live_site])
            subprocess.check_call(restart_cmd)
            time.sleep(10)
Beispiel #3
0
 def test_invalid_kerberos_config(self):
     """kerberos_service_name should be set if and only if using KERBEROS"""
     expected_message = 'kerberos_service_name.*KERBEROS'

     # A service name without KERBEROS auth is rejected ...
     with self.assertRaisesRegexp(ValueError, expected_message):
         hive.connect(_HOST, kerberos_service_name='')

     # ... and so is KERBEROS auth without a service name.
     with self.assertRaisesRegexp(ValueError, expected_message):
         hive.connect(_HOST, auth='KERBEROS')
Beispiel #4
0
    def test_custom_connection(self):
        """Check CUSTOM authentication against a reconfigured HiveServer2.

        The custom-auth ``hive-site.xml`` replaces the live configuration and
        the server is restarted; a query is run with one set of credentials
        and a second connect call is expected to fail validation.  The default
        configuration is restored in all cases.
        """
        repo_root = os.path.dirname(os.path.dirname(os.path.dirname(__file__)))
        conf_dir = os.path.join(repo_root, 'scripts', 'travis-conf', 'hive')
        custom_site = os.path.join(conf_dir, 'hive-site-custom.xml')
        default_site = os.path.join(conf_dir, 'hive-site.xml')
        live_site = os.path.join('/', 'etc', 'hive', 'conf', 'hive-site.xml')

        try:
            subprocess.check_call(['sudo', 'cp', custom_site, live_site])
            _restart_hs2()

            custom_conn = hive.connect(host=_HOST,
                                       username='******',
                                       auth='CUSTOM',
                                       password='******')
            with contextlib.closing(custom_conn) as connection:
                with contextlib.closing(connection.cursor()) as cursor:
                    cursor.execute('SELECT * FROM one_row')
                    self.assertEqual(cursor.fetchall(), [(1, )])

            with self.assertRaisesRegexp(TTransportException,
                                         'Error validating the login'):
                hive.connect(host=_HOST,
                             username='******',
                             auth='CUSTOM',
                             password='******')
        finally:
            # Restore the stock configuration even when the test failed.
            subprocess.check_call(['sudo', 'cp', default_site, live_site])
            _restart_hs2()
Beispiel #5
0
    def open(cls, connection):
        """Open the underlying Hive connection for ``connection``.

        Does nothing when ``connection.state`` is already ``'open'``.
        Otherwise connects according to ``connection.credentials.method``
        (``'http'`` or ``'thrift'``), retrying up to ``connect_retries`` extra
        times, sleeping ``connect_timeout`` seconds between attempts, while
        the error message contains ``pending`` or ``temporarily_unavailable``.

        Returns:
            The same ``connection`` object, with ``state`` set to ``'open'``
            and ``handle`` set to a ``ConnectionWrapper`` around the new
            Hive connection.

        Raises:
            ValueError: if the credentials name an unsupported method.
            Exception: whatever the last connection attempt raised, when the
                error is not retryable or the retries are exhausted.
        """
        if connection.state == 'open':
            logger.debug('Connection is already open, skipping open.')
            return connection

        creds = connection.credentials
        connect_retries = creds.get('connect_retries', 0)
        connect_timeout = creds.get('connect_timeout', 10)

        # Fail fast on an unknown method.  Previously neither branch below
        # ran, the loop left via ``break`` and ``conn`` was never bound, so
        # the failure surfaced later as an UnboundLocalError.
        if creds.method not in ('http', 'thrift'):
            raise ValueError(
                "invalid credential method: {!r} (expected 'http' or "
                "'thrift')".format(creds.method))

        exc = None
        for i in range(1 + connect_retries):
            try:
                if creds.method == 'http':

                    cls.validate_creds(
                        creds,
                        ['token', 'host', 'port', 'cluster', 'organization'])

                    conn_url = SPARK_CONNECTION_URL.format(**creds)
                    transport = THttpClient.THttpClient(conn_url)

                    # HTTP basic auth with the literal user name "token".
                    raw_token = "token:{}".format(creds.token).encode()
                    token = base64.standard_b64encode(raw_token).decode()
                    transport.setCustomHeaders(
                        {'Authorization': 'Basic {}'.format(token)})

                    conn = hive.connect(thrift_transport=transport)
                elif creds.method == 'thrift':
                    cls.validate_creds(creds, ['host'])

                    conn = hive.connect(host=creds.host,
                                        port=creds.get('port'),
                                        username=creds.get('user'))
                break
            except Exception as e:
                exc = e
                # Only exceptions carrying a ``message`` attribute can be
                # classified as retryable; everything else is re-raised.
                if getattr(e, 'message', None) is None:
                    raise

                message = e.message.lower()
                is_pending = 'pending' in message
                is_starting = 'temporarily_unavailable' in message

                warning = "Warning: {}\n\tRetrying in {} seconds ({} of {})"
                if is_pending or is_starting:
                    logger.warning(
                        warning.format(e.message, connect_timeout, i + 1,
                                       connect_retries))
                    time.sleep(connect_timeout)
                else:
                    raise
        else:
            # Every attempt failed with a retryable error.
            raise exc

        wrapped = ConnectionWrapper(conn)

        connection.state = 'open'
        connection.handle = wrapped
        return connection
Beispiel #6
0
    def test_invalid_http_basic_auth(self):
        """BASIC auth over HTTP is rejected when no password is supplied."""
        expected_message = 'BASIC authentication requires password.'

        # Neither user name nor password.
        with self.assertRaisesRegexp(ValueError, expected_message):
            hive.connect(
                host=_HOST, thrift_transport_protocol='http', auth='BASIC')

        # User name only.
        with self.assertRaisesRegexp(ValueError, expected_message):
            hive.connect(host=_HOST,
                         thrift_transport_protocol='http',
                         auth='BASIC',
                         username='******')
Beispiel #7
0
    def execute(self, quals, columns):
        """Run the configured Hive query and yield each row as a dict.

        Uses ``self.query`` when set, otherwise selects every column in
        ``self.columns`` from ``self.table``.  ``quals`` and ``columns`` are
        accepted but not used to narrow the query.

        Yields:
            dict: maps each name in ``self.columns`` to the value at the same
            position in the fetched row.

        A ``NotImplementedError`` is reported through ``log_to_postgres`` at
        ``ERROR`` level instead of being re-raised here.
        """
        if self.query:
            statement = self.query
        else:
            statement = "SELECT " + ",".join(self.columns.keys()) + " FROM " + self.table

        # %-formatting works on both Python 2 and 3 (no ``unicode`` builtin).
        log_to_postgres('Hive query: %s' % (statement,), DEBUG)

        client = None
        cursor = None
        try:
            client = hive.connect(self.host, username='******', port=self.port)
            cursor = client.cursor()
            cursor.execute(statement)

            for row in cursor.fetchall():
                yield {name: row[idx] for idx, name in enumerate(self.columns)}

        except NotImplementedError as ix:
            log_to_postgres(str(ix), ERROR)
        finally:
            # Runs when the generator is exhausted, closed or fails, so the
            # cursor and connection are no longer leaked.
            for resource in (cursor, client):
                if resource is not None:
                    resource.close()
Beispiel #8
0
    def load_data(self, path, partition_val):
        """
            Loads the file at the given path into the Hive table.

            Parameters
            ----------
            path: <string>
                path specified in STDIN
            partition_val: <list>
                partition values specified in STDIN, one per entry of
                ``self.partitions`` and in the same order

            Note
            ----
            The query is hard-coded here; adjust it as needed.
            Failures are written to the trace log and not re-raised.
        """
        partition_lst = [
            "%s='%s'" % (name, partition_val[idx])
            for idx, name in enumerate(self.partitions)
        ]
        # The original joined an undefined name ``lst`` here (NameError).
        partition_str = ','.join(partition_lst)

        # NOTE(review): values are interpolated into the statement unescaped;
        # only pass trusted paths and partition values.
        query = "LOAD DATA INPATH '%s' \
                 OVERWRITE INTO TABLE %s \
                 PARTITION(%s)" % (path, self.table, partition_str)
        __LOG__.Trace(query)
        try:
            self.cursor = hive.connect(**self.conn_info).cursor()
            self.cursor.execute(query)
        except Exception as ex:
            __LOG__.Trace(ex)
Beispiel #9
0
def get_jd_data_hive():
    '''
    Fetch the JD ``spu_id`` values from Hive and dump them to text files.

    The data fetched from Hive is a list in which every item is a tuple and
    every cell is one element of that tuple, e.g.
    2235063
    <class 'list'>
    ('62456977355',)

    The rows are split with ``list_of_groups(data, 24000000)`` and each group
    is written to ``./data/output_data_<n>.txt`` (n starting at 1), one
    ``spu_id`` per line.

    :return: the fetched rows, or ``None`` when any step failed (the
        traceback is printed instead of being raised)
    '''
    try:
        conn = hive.connect(host='172.20.207.6', port=10000, username='******')
        try:
            cur = conn.cursor()
            try:
                sql = "select spu_id from dim.dim_retailers_online_spu_sku where platform_type = 'jd'"
                cur.execute(sql)
                data = cur.fetchall()
                print("finish getting data!")
                ret_data = list_of_groups(data, 24000000)
                print("hive get data finish!")
                for idx, ret_item in enumerate(ret_data, 1):
                    out_path = "./data/output_data_" + str(idx) + ".txt"
                    with open(out_path, "w", encoding="utf-8") as f1:
                        for item in ret_item:
                            f1.write(item[0] + "\n")
                return data
            finally:
                # Close even on failure; previously both handles leaked
                # whenever anything above raised.
                cur.close()
        finally:
            conn.close()

    except Exception:
        print(traceback.format_exc())
        return None
Beispiel #10
0
def hiveQuery(SQL):
    """Run ``SQL`` on Hive with Kerberos authentication and return all rows.

    Runs ``kinit`` with the keytab first (its exit status is not checked),
    registers the hive-contrib jar on the session, then executes ``SQL``.
    The cursor and connection are closed before returning, including when
    the query fails.
    """
    os.system("kinit bigf_admin -kt /etc/bigf.keytab")
    conn = hive.connect(host=hiveHost, auth="KERBEROS", kerberos_service_name="hive")
    try:
        cursor = conn.cursor()
        try:
            cursor.execute("add jar /opt/cloudera/parcels/CDH/jars/hive-contrib-1.1.0-cdh5.12.0.jar")
            cursor.execute(SQL)
            return cursor.fetchall()
        finally:
            cursor.close()
    finally:
        conn.close()
Beispiel #11
0
    def connInstance(self,
                     host=None,
                     port=10000,
                     username=None,
                     schema='default',
                     auth=None,
                     password=None):
        """Connect to Hive and store a cursor on ``self`` under ``self.conn_key``.

        Makes up to ``TRY_CONNECT_TIMES`` attempts.  After a failed attempt
        number ``i`` (0-based) it sleeps ``i * randint(1, 5)`` seconds for the
        first four attempts and 60 seconds afterwards.  If every attempt
        fails the method returns without setting the attribute.
        """
        for i in range(TRY_CONNECT_TIMES):
            try:
                setattr(
                    self, self.conn_key,
                    hive.connect(host=host,
                                 port=port,
                                 username=username,
                                 database=schema,
                                 auth=auth,
                                 password=password).cursor())
                break
            except Exception as e:
                output('hive Exception ' + str(e), logType='hive')
            j = 60 if i >= 4 else i * random.randint(1, 5)
            # Send a warning once the connection keeps failing, but do not
            # abort: keep trying.  NOTE(review): the original comment said
            # "3 times", yet the code notifies at attempts 10 and 50.
            if i == 10 or i == 50:
                _msg = "Can't connect hive %s@%s %s times" % (
                    self.conn_key[5:], _binname, i)
                output(_msg, logType='hive')
                notice_me(_msg)
            time.sleep(j)
def get_change_handle(table_id, table, target_database):
    """Run the drop statement for a table whose DDL is flagged as changed.

    When ``metadata.get_change_ddl(table_id)`` is truthy, executes
    ``const.DROP_SQL`` formatted with ``(target_database, table)`` on Hive.
    The cursor and connection are closed afterwards, including on failure.
    """
    if not metadata.get_change_ddl(table_id):
        return

    conn = hive.connect(host=config.hiveserver2,
                        username=config.hive_user,
                        port=config.hive_port)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(const.DROP_SQL % (target_database, table))
        finally:
            cursor.close()
    finally:
        conn.close()
def create_mask_table(target_database, target_table, field, datatype,
                      partition_key, table_comment, field_comment):
    """Create the masked table from the ``CREATE_MASK_TABLE`` template.

    Args:
        target_database: substituted as ``database`` in the template.
        target_table: substituted as ``table`` in the template.
        field: column names, in order.
        datatype: accepted for interface compatibility but unused; every
            column is declared as ``string`` (see the note below).
        partition_key: name of the (string) partition column.
        table_comment: substituted as ``comment`` in the template.
        field_comment: per-column comments, indexed like ``field``.
    """
    # Crudely use the string type for every column to avoid data-type
    # conflicts (e.g. a bigint ID-card number that is masked and stored back
    # into a bigint column would only ever query as null).
    content = ",".join(
        item + " string COMMENT '" + field_comment[idx] + "'"
        for idx, item in enumerate(field))
    partition = partition_key + " string"
    args = {
        "database": target_database,
        "table": target_table,
        "content": content,
        "partition": partition,
        "comment": table_comment
    }
    template = Template(mask_const.CREATE_MASK_TABLE)

    conn = hive.connect(host=config.hiveserver2,
                        username=config.hive_user,
                        port=config.hive_port)
    try:
        cursor = conn.cursor()
        try:
            cursor.execute(template.substitute(args))
        finally:
            cursor.close()
    finally:
        # The connection used to be left open.
        conn.close()
Beispiel #14
0
def get_sku_10w(output_check_file, cat1_name, data_table):
    """Export rows of ``data_table`` that match the pdd "10w" table to Excel.

    Looks up the partition date with ``get_new_dt``, selects the rows of
    ``data_table`` for that date and ``cat1_name`` that have a matching
    ``sku_id`` in ``dwi.dwi_retailers_online_platform_info_pdd_10w``, and
    writes a header row followed by the result rows through ``writeExcel``.

    A failing query is reported by printing the traceback; the sheet is then
    written with the header row only.  The cursor and connection are always
    closed.
    """
    save_value_list = [
        ['sku_id', 'title', 'brand_std_id', 'brand_std_name', 'match_type_name', 'cat1_id',
         'cat1_name', 'cat2_id',
         'cat2_name',
         'cat3_id', 'cat3_name']
    ]

    conn = hive.connect(host='172.20.207.6', port=10000, username='******')
    cur = None
    try:
        cur = conn.cursor()
        new_dt = get_new_dt(cur, data_table, cat1_name)
        try:
            # NOTE(review): values are interpolated into the SQL unescaped;
            # only pass trusted table and category names.
            sql_str = """select * from %s a
        left join dwi.dwi_retailers_online_platform_info_pdd_10w
        c on c.sku_id = a.sku_id
        where a.dt = '%s' and c.sku_id is not null and a.cat1_name='%s'""" % (
                data_table, new_dt, cat1_name)

            cur.execute(sql_str)
            save_value_list.extend(cur.fetchall())
        except Exception:
            print(traceback.format_exc())
    finally:
        if cur is not None:
            cur.close()
        conn.close()

    writeExcel(output_check_file, save_value_list, '10w+商品')
Beispiel #15
0
def create_hive_parq_table():
    """Create the external Parquet table ``example_parq`` via Hive on localhost."""
    ddl = '''
    create external table example_parq(one double, two string, three boolean)
        STORED AS PARQUET location 's3a://example-parquet/'
    '''
    hive_cursor = hive.connect('localhost').cursor()
    hive_cursor.execute(ddl)
Beispiel #16
0
def hiveconnection(inSql):
    """Run ``inSql`` on Hive and send the delimited result to the ``output`` port.

    Authentication is KERBEROS (service name ``hive``) when
    ``kerberos_enabled`` is truthy, otherwise CUSTOM with
    ``api.config.password``.  When ``http_enabled`` is truthy the connection
    is built from the transport returned by ``add_http_mode_support()``, so
    the auth settings above are not passed on that path.

    Every value is followed by ``api.config.delimiter`` (including the last
    value of a row) and every row is terminated by a newline.
    """
    if kerberos_enabled:
        auth = "KERBEROS"
        kerberos_service_name = "hive"
        password = None
    else:
        password = api.config.password
        auth = 'CUSTOM'
        kerberos_service_name = None

    if http_enabled:
        conn = hive.connect(thrift_transport=add_http_mode_support())
    else:
        conn = hive.Connection(host=hostname,
                               port=port,
                               username=user,
                               password=password,
                               database=database,
                               auth=auth,
                               kerberos_service_name=kerberos_service_name)

    # Release the cursor and connection once the rows are in memory; they
    # used to be left open.
    try:
        cur = conn.cursor()
        try:
            cur.execute(inSql)
            resultList = cur.fetchall()
        finally:
            cur.close()
    finally:
        conn.close()

    # Single join instead of repeated ``+=`` (which is quadratic).
    string = "".join(
        "".join(str(y) + api.config.delimiter for y in x) + "\n"
        for x in resultList)

    api.send("output", string)
Beispiel #17
0
    def get_conn(self, schema=None):
        """
        Returns a Hive connection object.

        :param schema: database to use; falls back to the schema stored on
            the connection and then to ``'default'``.
        """
        db = self.get_connection(self.hiveserver2_conn_id)  # pylint: disable=no-member
        auth_mechanism = db.extra_dejson.get('authMechanism', 'NONE')
        # Always bind ``username``: it used to be assigned only in the branch
        # below, so ``db.login or username`` raised NameError whenever the
        # login was empty and the mechanism was not 'NONE'.
        username = None
        if auth_mechanism == 'NONE' and db.login is None:
            # we need to give a username
            username = '******'
        kerberos_service_name = None
        if conf.get('core', 'security') == 'kerberos':
            auth_mechanism = db.extra_dejson.get('authMechanism', 'KERBEROS')
            kerberos_service_name = db.extra_dejson.get(
                'kerberos_service_name', 'hive')

        # Accept the deprecated 'GSSAPI' spelling and map it to 'KERBEROS',
        # which is the value handed to pyhive's connect() below.
        if auth_mechanism == 'GSSAPI':
            self.log.warning(
                "Detected deprecated 'GSSAPI' for authMechanism "
                "for %s. Please use 'KERBEROS' instead",
                self.hiveserver2_conn_id  # pylint: disable=no-member
            )
            auth_mechanism = 'KERBEROS'

        from pyhive.hive import connect
        return connect(host=db.host,
                       port=db.port,
                       auth=auth_mechanism,
                       kerberos_service_name=kerberos_service_name,
                       username=db.login or username,
                       password=db.password,
                       database=schema or db.schema or 'default')
Beispiel #18
0
    def process_data(self, file_location):
        """Load a local file into its Hive table and delete it on success.

        The file is first renamed to ``<file_location>.COMPLETE``; the target
        table comes from ``self.get_table(file_location)``.  Any failure is
        logged as a warning with its traceback and reported through the
        return value instead of being raised.

        Returns:
            A one-element tuple: ``('ok',)`` on success, ``('Fail',)``
            otherwise.  (The original ``return result,`` already produced a
            tuple; that shape is kept for existing callers.)
        """
        result = 'ok'
        conn = None
        cursor = None
        try:
            conn = hive.connect(self.ip,
                                port=self.port,
                                username=self.hive_user_name,
                                database=self.database
                                )
            cursor = conn.cursor()
            new_file_location = file_location + '.COMPLETE'
            shutil.move(file_location, new_file_location)
            table = self.get_table(file_location)
            # Explicit check instead of ``assert``, which is stripped by -O.
            if table is None:
                raise ValueError('no target table for %s' % file_location)
            LOAD_HSQL = "LOAD DATA LOCAL INPATH '%s' INTO TABLE %s" % (new_file_location, table)
            self.log.debug(LOAD_HSQL)
            cursor.execute(LOAD_HSQL)
        except Exception:
            self.log.warning(traceback.format_exc())
            result = 'Fail'
        finally:
            # ``cursor`` used to be unbound here when connect() itself failed,
            # turning the handled error into an UnboundLocalError.
            for resource in (cursor, conn):
                if resource is not None:
                    resource.close()

        if 'ok' == result:
            os.remove(new_file_location)
            self.log.info(new_file_location+' is deleted.')

        return (result,)
Beispiel #19
0
 def test_invalid_transport(self):
     """transport and auth are incompatible"""
     raw_socket = thrift.transport.TSocket.TSocket('localhost', 10000)
     buffered = thrift.transport.TTransport.TBufferedTransport(raw_socket)

     # Passing a host together with a ready-made transport must be rejected.
     with self.assertRaisesRegexp(ValueError,
                                  'thrift_transport cannot be used with'):
         hive.connect(_HOST, thrift_transport=buffered)
Beispiel #20
0
    def _get_connection(self):
        """Build a Hive connection over an HTTP thrift transport.

        The URL is assembled from the ``host``, ``http_scheme`` (default
        ``https``), ``port`` and ``http_path`` configuration entries.  When a
        ``username`` or ``http_password`` is configured, an HTTP Basic
        ``Authorization`` header is added to the transport.
        """
        host = self.configuration['host']

        scheme = self.configuration.get('http_scheme', 'https')

        # if path is set but is missing initial slash, prepend it
        path = self.configuration.get('http_path', '')
        if path and path[0] != '/':
            path = '/' + path

        # if port is set prepend colon
        port = self.configuration.get('port', '')
        if port:
            port = ':' + str(port)

        http_uri = "{}://{}{}{}".format(scheme, host, port, path)

        # create transport
        transport = THttpClient.THttpClient(http_uri)

        # if username or password is set, add Authorization header
        username = self.configuration.get('username', '')
        password = self.configuration.get('http_password', '')
        if username or password:
            # b64encode() needs bytes and returns bytes on Python 3; the old
            # str-only code raised TypeError there.  Python 2 byte strings
            # pass through unchanged.
            credentials = username + ':' + password
            if not isinstance(credentials, bytes):
                credentials = credentials.encode('utf-8')
            auth = base64.b64encode(credentials)
            if not isinstance(auth, str):
                auth = auth.decode('ascii')
            transport.setCustomHeaders({'Authorization': 'Basic ' + auth})

        # create connection
        connection = hive.connect(thrift_transport=transport)

        return connection
Beispiel #21
0
    def _get_connection(self):
        """Return a Hive connection that talks thrift over HTTP.

        The endpoint URL is built from the ``host``, ``http_scheme`` (default
        ``https``), ``port`` and ``http_path`` configuration entries; an HTTP
        Basic ``Authorization`` header is attached when a ``username`` or
        ``http_password`` is configured.
        """
        settings = self.configuration
        host = settings["host"]
        scheme = settings.get("http_scheme", "https")

        # A configured path always starts with a slash.
        path = settings.get("http_path", "")
        if path and not path.startswith("/"):
            path = "/" + path

        # A configured port is rendered as ":<port>"; an unset one is kept as is.
        port = settings.get("port", "")
        port_part = (":" + str(port)) if port else port

        transport = THttpClient.THttpClient(
            "{}://{}{}{}".format(scheme, host, port_part, path))

        username = settings.get("username", "")
        password = settings.get("http_password", "")
        if username or password:
            raw_credentials = b":".join(
                (username.encode("ascii"), password.encode("ascii")))
            token = base64.b64encode(raw_credentials).decode()
            transport.setCustomHeaders({"Authorization": "Basic " + token})

        return hive.connect(thrift_transport=transport)
    def __init__(self,
                 connection_string=None,
                 username=None,
                 password=None,
                 proxy_user=None,
                 impersonate=False,
                 *args,
                 **kwargs):
        """Open an LDAP-authenticated Hive connection within a 120 s timeout.

        Host, port (10000 when unset), default database and session
        configuration come from ``get_hive_connection_conf(connection_string)``.
        The job queue is ``root.dev-test`` unless both ``proxy_user`` and
        ``impersonate`` are truthy, in which case the proxy user is set and
        the queue becomes ``root.users.<proxy_user>``.  Extra positional and
        keyword arguments are accepted and ignored.
        """
        with Timeout(120, "Timeout connecting to HiveServer"):
            conn_conf = get_hive_connection_conf(connection_string)

            session_settings = dict(conn_conf.configuration)
            session_settings["mapred.job.queue.name"] = "root.dev-test"
            if proxy_user and impersonate:
                session_settings["hive.server2.proxy.user"] = proxy_user
                session_settings["mapred.job.queue.name"] = (
                    "root.users.%s" % proxy_user)

            self._connection = hive.connect(
                host=conn_conf.host,
                port=conn_conf.port or 10000,
                database=conn_conf.default_db,
                auth="LDAP",
                username=username,
                password=password,
                configuration=session_settings,
            )
        super(HiveClient, self).__init__()
Beispiel #23
0
    def execute_query(self, query, data=None):
        """Run a statement on Hive and return its rows.

        Args:
            query: The statement to be executed
            data[Optional]: The data to be used for parametrized query

        Returns:
            A list with one dict per row, mapping column name to value, or
            ``True`` when ``cursor.rowcount`` is -1 (no result set reported).

        Any exception raised by the driver propagates unchanged; the
        connection and cursor are closed in every case.
        """
        with contextlib.closing(
                hive.connect(host=self.__host,
                             username=self.__username)) as conn:
            with contextlib.closing(conn.cursor()) as cursor:
                cursor.execute(query, data)
                # In case of dml this is -1
                # NOTE(review): confirm the Hive cursor ever reports a
                # rowcount other than -1; if not, no rows are returned.
                if cursor.rowcount == -1:
                    return True
                columns = cursor.description
                return [{columns[index][0]: column
                         for index, column in enumerate(value)}
                        for value in cursor.fetchall()]
Beispiel #24
0
def pyhiveexesql(sql):
    """Run ``sql`` on Hive asynchronously, echoing server logs and result rows.

    The statement is submitted in asynchronous mode and polled until it
    leaves the INITIALIZED/RUNNING states, printing each fetched log line on
    the way.  Every result row is then printed.  Errors are printed instead
    of being raised, and the cursor and connection are always closed.
    """
    print(sql)
    conn = None
    cursor = None
    try:
        conn = hive.connect(host='', port=10000, username='')
        cursor = conn.cursor()
        # ``async`` is a reserved word since Python 3.7; PyHive's keyword is
        # ``async_``.
        cursor.execute(sql, async_=True)
        status = cursor.poll().operationState
        while status in (TOperationState.INITIALIZED_STATE,
                         TOperationState.RUNNING_STATE):
            for message in cursor.fetch_logs():
                print(message)
            # If needed, an asynchronous query can be cancelled at any time with:
            # cursor.cancel()
            # Poll once per pass, outside the log loop: previously the status
            # was only refreshed when there were log lines, so an empty batch
            # made this loop spin forever.
            status = cursor.poll().operationState
        print("测试连接HIVE库,并输出结果!")
        for row in cursor.fetchall():
            print(row)

    except Exception as exc:
        # Report the actual failure; the old handler printed the last log
        # line (``message``), which could even be unbound.
        print('%s' % (exc,))
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
Beispiel #25
0
def get_random_sample(output_file, cat1_name, data_table):
    '''
    Random sample across all products of one category, exported to Excel.

    Draws up to 3000 distinct row numbers between 0 and the value returned by
    ``get_table_count`` (inclusive), selects the rows with those row numbers
    from ``dwd.dwd_pdd_cat3_brand_reg`` for ``cat1_name`` and the date
    returned by ``get_new_dt``, and writes a header row plus the result rows
    through ``writeExcel``.

    A failing query is reported by printing the traceback; the sheet is then
    written with the header row only.

    :param output_file: target passed to ``writeExcel``
    :param cat1_name: category name to filter on
    :param data_table: passed to ``get_new_dt`` / ``get_table_count``; the
        sampling query itself reads the hard-coded table named above
    :return: None
    '''
    save_value_list = [
        ['sku_id', 'title', 'brand_std_id', 'brand_std_name', 'cat1_id',
         'cat1_name', 'cat2_id', 'cat2_name', 'cat3_id', 'cat3_name']
    ]

    conn = hive.connect(host='172.20.207.6', port=10000, username='******')
    cur = None
    try:
        cur = conn.cursor()

        new_dt = get_new_dt(cur, data_table, cat1_name)

        sku_count = get_table_count(cur, data_table, cat1_name)

        # Distinct row numbers in [0, sku_count].  The former rejection loop
        # never terminated when fewer than 3000 distinct values existed.
        candidates = range(sku_count + 1)
        picked = random.sample(candidates, min(3000, len(candidates)))
        where_cond = "(" + ", ".join("'" + str(no) + "'" for no in picked) + ")"

        try:
            sql_str = """
        select
        x.sku_id, x.title, x.brand_std_id, x.brand_std_name, x.cat1_std_id, x.cat1_std_name, x.cat2_std_id, x.cat2_std_name, x.cat3_std_id, x.cat3_std_name
        from
        (
            select row_number()
        over(partition
        by
        1) as rw_no, sku_id, title, brand_std_id, brand_std_name, cat1_std_id, cat1_std_name, cat2_std_id, cat2_std_name, cat3_std_id, cat3_std_name
        from dwd.dwd_pdd_cat3_brand_reg
            where
        cat1_name = '%s' and dt = '%s'
        ) x
        where rw_no in %s""" % (cat1_name, new_dt, where_cond)
            cur.execute(sql_str)
            save_value_list.extend(cur.fetchall())
        except Exception:
            print(traceback.format_exc())
    finally:
        # The cursor and connection used to be left open.
        if cur is not None:
            cur.close()
        conn.close()

    writeExcel(output_file, save_value_list, '全局随机采样')
Beispiel #26
0
def create_hive_tables(bc, dir_data_lc, fileSchemaType, **kwargs):
    """Register each requested table with ``bc`` through a fresh Hive cursor.

    The table names come from the ``tables`` keyword argument and default to
    ``tpchTables``.  ``dir_data_lc`` and ``fileSchemaType`` are accepted but
    not used.  The value returned by ``bc.create_table`` is printed for every
    table.
    """
    table_names = kwargs.get("tables", tpchTables)
    for table_name in table_names:
        # A new connection is opened for every table.
        hive_cursor = hive.connect("172.22.0.3").cursor()
        created = bc.create_table(table_name, hive_cursor)
        print(created)
Beispiel #27
0
 def connect_to_database(self, name, details=None):
     """Open and remember a connection for the database registered as ``name``.

     When ``details`` is given it is stored in ``self.registry`` first;
     otherwise the previously registered details are used.  ``details`` must
     provide ``"type"`` and ``"creds"``.  The connection goes into
     ``self.conns``; for mysql and presto a URI and an engine are stored in
     ``self.uris`` and ``self.engines`` as well.  Unknown types are ignored.
     """
     if not details:
         details = self.registry[name]
     else:
         self.registry[name] = details

     dbtype, creds = details["type"], details["creds"]

     if dbtype == "hive":
         from pyhive import hive
         self.conns[name] = hive.connect(**creds)
     elif dbtype == "presto":
         from pyhive import presto
         self.conns[name] = presto.connect(**creds)
         presto_uri = 'presto://{username}@{host}:{port}/hive/default'.format(
             **creds)
         self.uris[name] = presto_uri
         self.engines[name] = create_engine(self.uris[name])
     elif dbtype in ("postgres", "redshift"):
         import psycopg2
         self.conns[name] = psycopg2.connect(**creds)
     elif dbtype == "mysql":
         import pymysql
         self.conns[name] = pymysql.connect(local_infile=True, **creds)
         mysql_uri = 'mysql+pymysql://{user}:{password}@{host}:{port}/{db}'.format(
             **creds)
         self.uris[name] = mysql_uri
         self.engines[name] = create_engine(self.uris[name])
Beispiel #28
0
def pyhiveexesql(sql):
    """Run ``sql`` on Hive asynchronously and print the interface rows.

    The statement is submitted in asynchronous mode and polled until it
    leaves the INITIALIZED/RUNNING states, printing each fetched log line on
    the way.  The first seven columns of every result row are then printed
    in a fixed report format.  Errors are printed instead of being raised,
    and the cursor and connection are always closed.
    """
    print(sql)
    conn = None
    cursor = None
    try:
        conn = hive.connect(host='', port=10000, username='')
        cursor = conn.cursor()
        # ``async`` is a reserved word since Python 3.7; PyHive's keyword is
        # ``async_``.
        cursor.execute(sql, async_=True)
        status = cursor.poll().operationState
        while status in (TOperationState.INITIALIZED_STATE, TOperationState.RUNNING_STATE):
            for message in cursor.fetch_logs():
                print(message)
            # If needed, an asynchronous query can be cancelled at any time with:
            # cursor.cancel()
            # Poll once per pass, outside the log loop: previously the status
            # was only refreshed when there were log lines, so an empty batch
            # made this loop spin forever.
            status = cursor.poll().operationState
        print("测试连接HIVE库,并输出结果!")
        for var in cursor.fetchall():
            print("|  接口编号: %s,接口名称: %s,接口属性: %s,接口主题: %s,上传方式: %s,上传时限: %s,接口状态: %s   |" % \
                  (var[0],var[1],var[2],var[3],var[4],var[5],var[6]))

    except Exception as exc:
        # Report the actual failure; the old handler printed the last log
        # line (``message``), which could even be unbound.
        print('%s' % (exc,))
    finally:
        if cursor is not None:
            cursor.close()
        if conn is not None:
            conn.close()
Beispiel #29
0
    def get_conn(self, schema=None):
        """Return a pyhive connection for ``self.hiveserver2_conn_id``.

        :param schema: database to use; falls back to the schema stored on
            the connection and then to ``'default'``.
        """
        db = self.get_connection(self.hiveserver2_conn_id)
        auth_mechanism = db.extra_dejson.get('authMechanism', 'NONE')
        # Always bind ``username``: it used to be assigned only in the branch
        # below, so ``db.login or username`` raised NameError whenever the
        # login was empty and the mechanism was not 'NONE'.
        username = None
        if auth_mechanism == 'NONE' and db.login is None:
            # we need to give a username
            username = '******'
        kerberos_service_name = None
        if configuration.conf.get('core', 'security') == 'kerberos':
            auth_mechanism = db.extra_dejson.get('authMechanism', 'KERBEROS')
            kerberos_service_name = db.extra_dejson.get('kerberos_service_name', 'hive')

        # Accept the deprecated 'GSSAPI' spelling and map it to 'KERBEROS',
        # which is the value handed to pyhive's connect() below.
        if auth_mechanism == 'GSSAPI':
            self.log.warning(
                "Detected deprecated 'GSSAPI' for authMechanism "
                "for %s. Please use 'KERBEROS' instead",
                self.hiveserver2_conn_id
            )
            auth_mechanism = 'KERBEROS'

        from pyhive.hive import connect
        return connect(
            host=db.host,
            port=db.port,
            auth=auth_mechanism,
            kerberos_service_name=kerberos_service_name,
            username=db.login or username,
            database=schema or db.schema or 'default')
Beispiel #30
0
    def get_records(self, search_filter, column_filter, order_by, limit, offset):
        """Query ``self.table`` on Hive and return the rows in tabular form.

        Returns:
            dict with ``"fields"`` (from ``__get_fields_types``), ``"values"``
            (from ``__get_values``) and ``"total_count"`` (always 1).

        Any failure is logged and re-raised; the cursor and connection are
        closed in every case.
        """
        conn = cursor = None
        try:
            conn = hive.connect(host=self.host, port=self.port,
                                database=self.database)
            logger.log_debug("Created connection")
            cursor = conn.cursor()

            fields, fields_types = self.__get_fields_types(
                cursor, self.table, column_filter)

            query = self.__get_query(
                column_filter=column_filter,
                table=self.table,
                search_filter=search_filter,
                order_by=order_by,
                limit=limit,
                offset=offset)
            logger.log_info("Executing query: " + query)
            cursor.execute(query)

            values = self.__get_values(
                rows=cursor.fetchall(), fields=fields, fields_types=fields_types)
            return {"fields": fields, "values": values, "total_count": 1}
        except Exception as e:
            logger.log_error(
                "Failed while fetching data from database with error : " + str(e))
            raise e
        finally:
            # Cursor first, then the connection that owns it.
            for handle in (cursor, conn):
                if handle:
                    handle.close()
Beispiel #31
0
 def test_invalid_transport_protocol(self):
     """An unknown thrift_transport_protocol is rejected with a ValueError."""
     invalid_transport = 'invalid'
     expected_message = 'Invalid thrift_transport_protocol: {}'.format(
         invalid_transport)
     with self.assertRaisesRegexp(ValueError, expected_message):
         hive.connect(host=_HOST,
                      thrift_transport_protocol=invalid_transport)
Beispiel #32
0
 def test_invalid_binary_auth(self):
     """An unknown auth value is rejected with a NotImplementedError."""
     invalid_binary_auth = 'invalid'
     expected_message = (
         'Only NONE, NOSASL, LDAP, KERBEROS, CUSTOM authentication are supported, '
         'got {}'.format(invalid_binary_auth))
     with self.assertRaisesRegexp(NotImplementedError, expected_message):
         hive.connect(host=_HOST, auth=invalid_binary_auth)
Beispiel #33
0
    def _get_connection(self):
        """Build a Hive connection over an HTTP thrift transport.

        The URL is assembled from the ``host``, ``http_scheme`` (default
        ``https``), ``port`` and ``http_path`` configuration entries.  When a
        ``username`` or ``http_password`` is configured, an HTTP Basic
        ``Authorization`` header is added to the transport.
        """
        host = self.configuration['host']

        scheme = self.configuration.get('http_scheme', 'https')

        # if path is set but is missing initial slash, prepend it
        path = self.configuration.get('http_path', '')
        if path and path[0] != '/':
            path = '/' + path

        # if port is set prepend colon
        port = self.configuration.get('port', '')
        if port:
            port = ':' + str(port)

        http_uri = "{}://{}{}{}".format(scheme, host, port, path)

        # create transport
        transport = THttpClient.THttpClient(http_uri)

        # if username or password is set, add Authorization header
        username = self.configuration.get('username', '')
        password = self.configuration.get('http_password', '')
        if username or password:
            # b64encode() needs bytes and returns bytes on Python 3; the old
            # str-only code raised TypeError there.  Python 2 byte strings
            # pass through unchanged.
            credentials = username + ':' + password
            if not isinstance(credentials, bytes):
                credentials = credentials.encode('utf-8')
            auth = base64.b64encode(credentials)
            if not isinstance(auth, str):
                auth = auth.decode('ascii')
            transport.setCustomHeaders({'Authorization': 'Basic ' + auth})

        # create connection
        connection = hive.connect(thrift_transport=transport)

        return connection
Beispiel #34
0
def get_brand_topgmv_sample(focus_brand_file, output_check_file, cat1_name,
                            data_table):
    '''
    Sample the top-GMV SKUs of selected focus brands and write them to Excel.

    The first 15 brands returned by ``get_seed_brand_info`` are always kept;
    up to 35 more are drawn at random from the remaining ones. For every
    selected brand the query keeps at most 60 rows ranked by GMV.

    :param focus_brand_file: passed to ``get_seed_brand_info`` to load the brands
    :param output_check_file: destination handed to ``writeExcel``
    :param cat1_name: value matched against ``cat1_name`` in the query
    :param data_table: name of the table the query selects from
    :return: None
    '''
    save_value_list = []

    seed_brand_dict = get_seed_brand_info(focus_brand_file)
    seed_brand_list = list(seed_brand_dict.items())
    sample_brand_list = seed_brand_list[:15]
    extra_brand_list = seed_brand_list[15:]
    # random.sample raises ValueError when asked for more items than exist,
    # so cap the draw at the number of remaining brands.
    extra_count = min(35, len(extra_brand_list))
    sample_brand_list = sample_brand_list + random.sample(extra_brand_list,
                                                          extra_count)

    conn = hive.connect(host='172.20.207.6', port=10000, username='******')
    try:
        cur = conn.cursor()
        try:
            new_dt = get_new_dt(cur, data_table, cat1_name)

            brand_id_tmp = [
                "'" + sample_item[0] + "'" for sample_item in sample_brand_list
            ]
            brand_id_str = '(' + ','.join(brand_id_tmp) + ')'

            try:
                # NOTE(review): the statement is built with string
                # interpolation; the inputs are assumed to be trusted.
                sql1 = """
select * 
  from 
  (
    select * ,row_number() over(partition by x.brand_std_id order by x.gmv desc)rn
     from 
        (
          select a.*,c.gmv from %s a
          left join (SELECT sku_id,max(title) title,sum(sale_amount) AS gmv
            FROM dwi.dwi_retailers_online_platform_info
            WHERE platform_type = 'pdd'
            AND dc = 'month'
            group by sku_id) c on c.sku_id = a.sku_id
            where a.dt = '%s' and c.gmv is not null and a.brand_std_id in %s and a.cat1_name='%s'
        )x
  )d
where rn <= 60""" % (data_table, new_dt, brand_id_str, cat1_name)
                cur.execute(sql1)
                save_value_list.extend(cur.fetchall())
            except Exception:
                # Best effort: report the failure and still write the header.
                print(traceback.format_exc())
        finally:
            cur.close()
    finally:
        # Release the Hive session even when the query or date lookup fails.
        conn.close()

    save_value_list.insert(0, ['sku_id', 'title', 'brand_std_id', 'brand_std_name', 'match_type_name','cat1_id', 'cat1_name', 'cat2_id',
                               'cat2_name',
                               'cat3_id', 'cat3_name'])

    writeExcel(output_check_file, save_value_list, '重点品牌topGMV采样')
Beispiel #35
0
def db_query_date(city_name, date_value):
    """Return weather rows matching a city and a date as a JSON string.

    Both arguments are matched as substrings (``LIKE '%value%'``). They are
    passed to the driver as query parameters rather than concatenated into
    the SQL text, so quotes in the input cannot alter the statement.

    Args:
        city_name: substring matched against the ``location`` column.
        date_value: substring matched against the ``temp_dat`` column.

    Returns:
        A JSON array string (``orient='records'``) with the keys ``index``,
        ``location``, ``temp_date``, ``act_temp`` and ``pred_temp``; ``'[]'``
        when no row matches.
    """
    # NOTE(review): the filter uses column ``temp_dat`` while the frame names
    # it ``temp_date`` -- confirm the real column name against the table.
    query = (
        "SELECT * FROM weather_data "
        "WHERE location LIKE %(city_pattern)s AND temp_dat LIKE %(date_pattern)s"
    )
    parameters = {
        'city_pattern': '%' + city_name + '%',
        'date_pattern': '%' + date_value + '%',
    }

    connection = hive.connect('localhost')
    try:
        cursor = connection.cursor()
        try:
            cursor.execute(query, parameters)
            all_data = cursor.fetchall()
        finally:
            cursor.close()
    finally:
        connection.close()

    # Passing the column names to the constructor also works for an empty
    # result, where assigning ``df.columns`` afterwards would raise.
    df = pd.DataFrame(
        [list(row) for row in all_data],
        columns=["location", "temp_date", "act_temp", "pred_temp"],
    )
    return df.reset_index().to_json(orient='records')
Beispiel #36
0
    def _get_connection(self):
        """Open a Hive connection from the ``host``, ``port``, ``database`` and
        ``username`` configuration entries.

        ``host`` is required; ``database`` falls back to ``'default'`` and
        the other entries fall back to ``None``.
        """
        settings = self.configuration
        connect_kwargs = {
            'host': settings['host'],
            'port': settings.get('port', None),
            'database': settings.get('database', 'default'),
            'username': settings.get('username', None),
        }
        return hive.connect(**connect_kwargs)
Beispiel #37
0
    def _get_connection(self):
        """Open a Hive connection over HTTPS using token-based Basic auth.

        The endpoint is ``https://<host><http_path>``. The ``http_password``
        configuration entry is sent as the password of the fixed user
        ``token`` in an HTTP Basic ``Authorization`` header.

        Returns:
            The connection object returned by ``hive.connect``.
        """
        host = self.configuration['host']

        # if path is set but is missing its initial slash, prepend it
        path = self.configuration.get('http_path', '')
        if path and path[0] != '/':
            path = '/' + path

        http_uri = "https://{}{}".format(host, path)

        transport = THttpClient.THttpClient(http_uri)

        password = self.configuration.get('http_password', '')
        credentials = 'token:' + password
        # b64encode requires bytes and returns bytes: encode text input first
        # and decode the result so it can be joined to 'Basic '.
        if not isinstance(credentials, bytes):
            credentials = credentials.encode('utf-8')
        auth = base64.b64encode(credentials).decode('ascii')
        transport.setCustomHeaders({'Authorization': 'Basic ' + auth})

        return hive.connect(thrift_transport=transport)
Beispiel #38
0
def hive_to_df(sql=""):
    """
    Execute ``sql`` on the Hive server at ``localhost`` and return the result.

    Args:
        sql: sql string

    Returns: pandas DataFrame whose columns are named after the first field
        of each entry in ``cursor.description``

    """
    print('connecting')
    connection = hive.connect('localhost')
    try:
        cursor = connection.cursor()
        try:
            print('query start')
            cursor.execute(sql)
            cont = cursor.fetchall()
            col_names = [col[0] for col in cursor.description]
        finally:
            cursor.close()
    finally:
        # Close the session whether or not the query succeeded.
        connection.close()
    return pd.DataFrame(data=cont, columns=col_names)
Beispiel #39
0
    def run_query(self, query):
        """Execute ``query`` against Hive and serialise the result as JSON.

        Connection arguments come from ``self.configuration.to_dict()``.

        Returns:
            A ``(json_data, error)`` tuple. On success ``json_data`` is a JSON
            string with ``columns`` and ``rows`` and ``error`` is ``None``.
            When interrupted by ``KeyboardInterrupt``, ``json_data`` is
            ``None`` and ``error`` holds a cancellation message.

        Raises:
            Exception: any other error is logged and re-raised unchanged.
        """
        connection = None
        try:
            connection = hive.connect(**self.configuration.to_dict())

            cursor = connection.cursor()

            cursor.execute(query)

            column_names = []
            columns = []

            for column in cursor.description:
                column_name = column[COLUMN_NAME]
                column_names.append(column_name)

                columns.append({
                    'name': column_name,
                    'friendly_name': column_name,
                    'type': types_map.get(column[COLUMN_TYPE], None)
                })

            rows = [dict(zip(column_names, row)) for row in cursor]

            data = {'columns': columns, 'rows': rows}
            json_data = json.dumps(data, cls=JSONEncoder)
            error = None
            cursor.close()
        except KeyboardInterrupt:
            # The interrupt may arrive before the connection exists.
            if connection:
                connection.cancel()
            error = "Query cancelled by user."
            json_data = None
        except Exception as e:
            logging.exception(e)
            # A bare raise re-raises the active exception with its original
            # traceback and is valid syntax on both Python 2 and Python 3.
            raise
        finally:
            if connection:
                connection.close()

        return json_data, error
Beispiel #40
0
 def test_invalid_kerberos_config(self):
     """kerberos_service_name and KERBEROS auth must be supplied together."""
     expected_message = 'kerberos_service_name.*KERBEROS'
     # Service name given without KERBEROS auth.
     with self.assertRaisesRegexp(ValueError, expected_message):
         hive.connect(_HOST, kerberos_service_name='')
     # KERBEROS auth requested without a service name.
     with self.assertRaisesRegexp(ValueError, expected_message):
         hive.connect(_HOST, auth='KERBEROS')
Beispiel #41
0
 def connect(self):
     """Return a new Hive connection to ``_HOST`` for the fixed test user."""
     connection = hive.connect(host=_HOST, username='******')
     return connection
Beispiel #42
0
    def run_query(self, query, user):
        """Execute ``query`` against Hive and serialise the result as JSON.

        When the ``use_http`` configuration entry is truthy, the connection
        uses a Thrift HTTP transport built from ``http_scheme`` (default
        ``'https'``), ``host``, ``port`` and ``http_path``, with an HTTP Basic
        ``Authorization`` header if ``username`` or ``http_password`` is set.
        Otherwise it connects directly with ``host``, ``port``, ``database``
        and ``username``.

        Args:
            query: SQL text to execute.
            user: not used by this method.

        Returns:
            A ``(json_data, error)`` tuple. On success ``json_data`` is a JSON
            string with ``columns`` and ``rows`` and ``error`` is ``None``.
            When interrupted by ``KeyboardInterrupt``, ``json_data`` is
            ``None`` and ``error`` holds a cancellation message.
        """
        connection = None
        try:
            host = self.configuration['host']

            if self.configuration.get('use_http', False):
                # default to https
                scheme = self.configuration.get('http_scheme', 'https')

                # if path is set but is missing its initial slash, prepend it
                path = self.configuration.get('http_path', '')
                if path and path[0] != '/':
                    path = '/' + path

                # if port is set, prepend a colon; str() keeps integer ports working
                port = self.configuration.get('port', '')
                if port:
                    port = ':' + str(port)

                http_uri = "{}://{}{}{}".format(scheme, host, port, path)

                # create transport
                transport = THttpClient.THttpClient(http_uri)

                # if username or password is set, add Authorization header
                username = self.configuration.get('username', '')
                password = self.configuration.get('http_password', '')
                # Logical ``or``: the bitwise ``|`` is not defined for strings.
                if username or password:
                    credentials = username + ':' + password
                    # b64encode requires bytes and returns bytes: encode text
                    # first and decode the result before joining to 'Basic '.
                    if not isinstance(credentials, bytes):
                        credentials = credentials.encode('utf-8')
                    auth = base64.b64encode(credentials).decode('ascii')
                    transport.setCustomHeaders({'Authorization': 'Basic ' + auth})

                # create connection
                connection = hive.connect(thrift_transport=transport)
            else:
                connection = hive.connect(
                    host=host,
                    port=self.configuration.get('port', None),
                    database=self.configuration.get('database', 'default'),
                    username=self.configuration.get('username', None),
                )

            cursor = connection.cursor()

            cursor.execute(query)

            column_names = []
            columns = []

            for column in cursor.description:
                column_name = column[COLUMN_NAME]
                column_names.append(column_name)

                columns.append({
                    'name': column_name,
                    'friendly_name': column_name,
                    'type': types_map.get(column[COLUMN_TYPE], None)
                })

            rows = [dict(zip(column_names, row)) for row in cursor]

            data = {'columns': columns, 'rows': rows}
            json_data = json.dumps(data, cls=JSONEncoder)
            error = None
        except KeyboardInterrupt:
            # The interrupt may arrive before the connection exists.
            if connection:
                connection.cancel()
            error = "Query cancelled by user."
            json_data = None
        finally:
            if connection:
                connection.close()

        return json_data, error
Beispiel #43
0
 def test_invalid_ldap_config(self):
     """A password and LDAP auth must be supplied together."""
     expected_message = 'password.*LDAP'
     # Password given without LDAP auth.
     with self.assertRaisesRegexp(ValueError, expected_message):
         hive.connect(_HOST, password='')
     # LDAP auth requested without a password.
     with self.assertRaisesRegexp(ValueError, expected_message):
         hive.connect(_HOST, auth='LDAP')
# -*- coding: utf-8 -*-


from pyhive import hive
# Connect to HiveServer2, run one query and print the number of rows returned.
host = '0.0.0.0'
port = 10000

hiveConn = hive.connect(host=host, port=port)
try:
    cursor = hiveConn.cursor()

    query = 'select * from table'
    cursor.execute(query)
    aa = cursor.fetchall()
    # Single-argument call form prints the same text on Python 2 and Python 3.
    print(len(aa))
finally:
    # Close the connection even if the query fails.
    hiveConn.close()
Beispiel #45
0
	def __init__(self, hostname='localhost', port=10000, schema='default', username='******', **kwargs):
		"""Open an LLAP connection and store it on ``self.conn``.

		``port`` is converted with ``int()`` and ``schema`` is passed as the
		``database`` argument. Extra keyword arguments are accepted but not used.
		"""
		session_settings = {'hive.cli.print.header': 'false'}
		self.conn = llap.connect(
			host=hostname,
			port=int(port),
			username=username,
			database=schema,
			configuration=session_settings,
		)
Beispiel #46
0
 def connect(self):
     """Return a Hive connection to ``_HOST`` with ``mapred.job.tracker`` set to ``local``."""
     session_settings = {'mapred.job.tracker': 'local'}
     return hive.connect(host=_HOST, configuration=session_settings)