Example #1
def get_connection(db_conn_conf, use_default_db):

    # Set up which database to use
    if use_default_db:
        db_name = "default"
    else:  # Set it to the one specified in the config file
        db_name = db_conn_conf['database']

    # Create the connection
    if db_conn_conf['connector'] == 'hive':
        conn = pyhs2.connect(host=db_conn_conf['host'],
                             port=int(db_conn_conf['port']),
                             authMechanism="PLAIN",
                             user=db_conn_conf['user'],
                             database=db_name)
        return conn
    elif db_conn_conf['connector'] == 'impala':
        conn = impala.dbapi.connect(host=db_conn_conf['host'],
                                    port=db_conn_conf['port'],
                                    user=db_conn_conf['user'],
                                    database=db_name)
        return conn
    else:
        print "Can't find connector"
        sys.exit(0)
Example #2
 def start_requests(self):
     with pyhs2.connect(host='10.15.1.16',
                        port=10000,
                        authMechanism="PLAIN",
                        user='******',
                        password='******',
                        database='ods') as conn:
         with conn.cursor() as cur:
             the_day = time.strftime('%Y-%m-%d', time.localtime())
             cur.execute(
                 "select goods_id from ods.daily_goodsList where dt='{}' and user_id in (446338500,890482188,"
                 "133227658,1993730769,772352677,167873659, 98563612, 353042333, 1731961317, 353042353, 829273025, "
                 "2940972233, 2940677727, 435878238, 281917995, 737997431, 458599810, 1739810699, 1574853209, "
                 "1893742894, 1122478447, 373327370, 2647118809, 2786693231, 2434852658, 3000560259, 1916102784, "
                 "656650799, 2074964291, 708668355, 720472756, 356579667, 1731961317,98563612,1689954831,387266832"
                 ",2073309259,325718097,1891339807,834807033,1974964452,372602234,167486422,356374102,"
                 "320083279) group by goods_id".format(the_day))
             for i in cur.fetch():
                 nid = i[0]
                 url = 'https://api.m.taobao.com/h5/mtop.taobao.detail.getdetail/6.0/?appKey=12574478&t=14606167' \
                       '25586&sign=04b5eb36c2ccfebe0d39dab46de5ec18&api=mtop.taobao.detail.getdetail&v=6.0&ttid=' \
                       '2013%40taobao_h5_1.0.0&type=jsonp&dataType=jsonp&callback=&data=%7B%22itemNumId%22%3A%2' \
                       '2{}%22%2C%22exParams%22%3A%22%7B%5C%22id%5C%22%3A%5C%228548526%5C%22%2C%5C%22wp_app%5C' \
                       '%22%3A%5C%22weapp%5C%22%7D%22%7D'.format(nid)
                 #print url
                 yield scrapy.Request(url,
                                      headers=header,
                                      callback=self.parse_detail,
                                      meta={
                                          'nid': nid,
                                          'retry': 0
                                      },
                                      dont_filter=True)
Example #3
def hs2_pull_data(query, buffername, max_id, field):
    with open('pass.json', 'rb') as passfile:
        passdict = json.load(passfile)
    with pyhs2.connect(host=passdict['HIVE_HOST'],
                       port=10000,
                       authMechanism='PLAIN',
                       user=passdict['HIVE_USER'],
                       database='default',
                       cursorclass=SSCursor) as conn:
        with gzip.open(
                '/home/epeters/public/dumps/%s_redshift.gz' % (buffername),
                'wb+') as gzip_csv_file:
            cur = conn.cursor()
            datacsv = csv.writer(gzip_csv_file,
                                 delimiter='|',
                                 quotechar='`',
                                 quoting=csv.QUOTE_MINIMAL)
            yesterday = datetime.datetime.utcnow() - datetime.timedelta(days=1)
            query = re.sub('@YEAR@', str(yesterday.year), query)
            query = re.sub('@MONTH@', str(yesterday.month), query)
            query = re.sub('@DAY@', str(yesterday.day), query)
            query = re.sub('@MAX_ID@', '%s > %s' % (max_id, field), query)
            #print "query is now:"
            #print query
            cur.execute(query)
            #print "Query execute successful"
            counter = 0
            for i in cur.fetch():
                counter += 1
                if counter % 10000 == 0:
                    print "counter now at %s" % (counter)
                datacsv.writerow(i)
        return cur.getSchema()
Example #4
def getHiveData(loadsql, createsql, dropsql, selectsql):

    with pyhs2.connect(host='localhost',
                       port=10000,
                       authMechanism="PLAIN",
                       user='******',
                       password='******',
                       database='default') as conn:
        with conn.cursor() as cur:
            # Show databases
            print("--------")
            cur.execute(dropsql)
            print("--------")
            cur.execute(createsql)
            print("--------")
            cur.execute(loadsql)
            print("--------")
            cur.execute(selectsql)
            print("--------")
            # Return column info from query
            # print cur.getSchema()

            # Fetch table results
            result = []
            for i in cur.fetch():
                result.append(i)
                yield i
Example #5
def purchase(id, purchase_type):
    """Returning the number of purchases for certain ids. Type of purchase (release or track) is needed"""

    # testing if purchase type is correct
    if purchase_type == 'release':
        ctype = 'release_id'
    elif purchase_type == 'track':
        ctype = 'track_id'
    else:
        raise AttributeError("provide valid purchase_type: 'release' or 'track'")

    #splitting ids
    cid = id_split(id)
    cid = cid.replace('src_id', ctype)

    # uncomment the line below to print created Hive query
    # print ('select count(transaction_id), %s from purchase where (%s) group by %s' % (ctype, cid, ctype))

    # connecting to Hive, sending query, returning results of query
    conn = pyhs2.connect(host='nif-nif.zvq.me', port=10000, authMechanism="NOSASL", user='******', password='******',
                         database='default')
    cur = conn.cursor()
    cur.execute('select count(transaction_id), %s from purchase where (%s) group by %s' % (ctype, cid, ctype))
    results = cur.fetch()
    cur.close()
    conn.close()
    return results
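A minimal usage sketch of purchase() follows (hedged: the exact id format expected by id_split() is not shown in the original, so the argument is purely illustrative):

# Hypothetical call; each returned row is (count(transaction_id), track_id).
rows = purchase(ids, 'track')
for count, track_id in rows:
    print track_id, count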
Example #6
 def _set_hive(self):
     hive = pyhs2.connect(host=self.config['hive']['host'],
                          port=int(self.config['hive']['port']),
                          authMechanism='PLAIN',
                          user=self.config['hive']['username'],
                          password="")
     return hive.cursor()
Example #7
    def process(self, tup):
        movie_id = tup.values[0]
        # [{'field_name' : 'field_value', ...}, ....]
        tweets = json.loads(tup.values[1])
        self.log('Received %d tweets for movie %s' % (len(tweets), movie_id))
        if self.verbose:
            self.log(tweets)

        with pyhs2.connect(host=get_config('host'),
                           port=get_config('port'),
                           authMechanism=get_config('auth'),
                           user=str(get_config('user')),
                           database=get_config('database')) as conn:
            with conn.cursor() as cur:
                query = "INSERT INTO " + get_config(
                    'tweet_data_tablename') + " VALUES "
                for t in tweets:
                    # Clean up the tweets before storing
                    text = unicodedata.normalize('NFKD', t['text']).encode(
                        'ascii', 'ignore')
                    text = re.sub(r'\'', '', text)
                    text = re.sub(r'\\', ' ', text)
                    text = re.sub(r'[\s\n\t\r]', ' ', text)
                    query += "('" + movie_id + "', " + str(
                        t['id']) + ", '" + text + "'),"
                if self.verbose:
                    self.log(query)
                cur.execute(query[:-1])  # -1 to remove last comma
Example #8
def call_get_meta_data(table_name, sandbox_ip):
    try:
        with pyhs2.connect(host=sandbox_ip,
                           port=10000,
                           authMechanism="PLAIN",
                           user='******',
                           password='******',
                           database='bigdata') as conn:
            with conn.cursor() as cur:
                # Show databases
                # print cur.getDatabases()
                query = "desc formatted " + table_name
                print(query)
                cur.execute(query)
                meta_data = []
                flag = 0
                for i in cur.fetch():
                    if (i[0] == '' and flag == 0):
                        flag = 1
                    elif (i[0] == '' and flag == 1):
                        flag = 2
                    elif (flag == 1):
                        meta_data.append((i[0].strip(), i[1].strip()))
                return meta_data
    except Exception, e:
        print e
        return None
Example #9
  def execute_sql (self, database_name, sql, fetch_result = False):
    import pyhs2
    conn = pyhs2.connect(host=self.host, port=self.port, authMechanism="PLAIN", user="******", password="", database='default', timeout=5000)
    print "Connected to hiverserver2"
    # turn on tez and add serde jar
    c = conn.cursor()
    c.execute("set hive.execution.engine=tez")
    c.execute("set hive.cache.expr.evaluation=false")
    # c.execute("add jar %s" % self.hive_serdes_path)

    if database_name != None:
      c.execute("use %s" % database_name)

    # run the actual command
    print "Executing HiveQL: %s" % (sql)
    c.execute(sql)

    output = []
    if fetch_result:
      rows = c.fetchall()
      for row in rows:
        output.append(row)

    c.close()
    conn.close()

    return output
Example #10
    def processQuery(self, querylist, flag):

        propertyObj = ConfigProperties()
        hostname = propertyObj.localhivehost()
        portnumber = propertyObj.localhiveport()
        authentication = propertyObj.localhiveauthentication()
        username = propertyObj.localhiveuser()
        userpassword = propertyObj.localuserpassword()
        databasename = propertyObj.localhivedatabase()


        conn = pyhs2.connect(host=hostname, port = portnumber, authMechanism = authentication, user=username, password=userpassword, database = databasename)
        cur = conn.cursor()

        cur.execute(querylist['createdb'])
        cur.execute(querylist['workdb'])
        cur.execute(querylist['droptable1'])
        cur.execute(querylist['createtable1'])
        cur.execute(querylist['testcode'])
        cur.execute(querylist['droptable2'])
        cur.execute(querylist['createtable2'])

        if flag == 0:
            cur.close()
            conn.close()
Example #11
    def load_data_orc(self, table_name):
        with pyhs2.connect(host=self.env['HIVE_HOST'], port=10000, authMechanism="PLAIN",
                           user=self.env['USER_NAME'], password='', database=self.config['HIVE_DB']) as conn:
            with conn.cursor() as cur:
                # set_hive = "SET hive.mergejob.maponly=true"
                # self.logger.info("RUNNING at HIVE: %s", set_hive)
                # cur.execute(set_hive)

                # set_hive = "SET hive.merge.mapredfiles=true"
                # self.logger.info("RUNNING at HIVE: %s", set_hive)
                # cur.execute(set_hive)

                # set_hive = "SET hive.merge.mapfiles=true"
                # self.logger.info("RUNNING at HIVE: %s", set_hive)
                # cur.execute(set_hive)

                # set_hive = "SET hive.merge.size.per.task=256000000"
                # self.logger.info("RUNNING at HIVE: %s", set_hive)
                # cur.execute(set_hive)

                # set_hive = "SET hive.merge.smallfiles.avgsize=16000000000"
                # self.logger.info("RUNNING at HIVE: %s", set_hive)
                # cur.execute(set_hive)

                # self.logger.info("RUNNING at HIVE: %s", set_hive)
                # cur.execute(set_hive)

                hive_sql = "INSERT INTO TABLE " + self.config['HIVE_DB'] + "." + table_name
                # + " PARTITION (" + primary_id + "=" + primary_value + ")"
                hive_sql = hive_sql + " SELECT " + self.table.get_table_column(table_name) + " from " + self.config['HIVE_DB'] + "." + table_name + "_text"
                print hive_sql
                self.logger.info("RUNNING at HIVE: %s", hive_sql)
                cur.execute(hive_sql)
Example #12
    def execute_sql(self, database_name, sql, fetch_result=False):
        import pyhs2
        conn = pyhs2.connect(host=self.host,
                             port=self.port,
                             authMechanism="NOSASL",
                             database='default')

        # turn on tez and add serde jar
        c = conn.cursor()
        c.execute("set hive.execution.engine=tez")
        c.execute("set hive.cache.expr.evaluation=false")
        c.execute("add jar %s" % self.hive_serdes_path)

        if database_name != None:
            c.execute("use %s" % database_name)

        # run the actual command
        print "Executing HiveQL: %s" % (sql)
        c.execute(sql)

        output = []
        if fetch_result:
            rows = c.fetchall()
            for row in rows:
                output.append(row)

        c.close()
        conn.close()

        return output
Example #13
    def connect(self):
      print "I'm running but will hang some time. Please be patient..."
      
  
      with pyhs2.connect(host='cosmos.lab.fi-ware.org',
                         port=10000,
                         authMechanism="PLAIN",
                         user='',
                         password='',
                         database='default') as conn:
          with conn.cursor() as self.cur:
              #Show databases
              #print cur.getDatabases()

              #Execute query
              self.cur.execute("select * from andre_silva_fresh_serv_fresh_servpath_sensor_9_sensor_column")
              self.db_zone1_cols = []
              for i in self.cur.getSchema():
                  if("_md" not in i['columnName'].split('.')[1] and
                     "recv" not in i['columnName'].split('.')[1]):
                      self.db_zone1_cols.append(i['columnName'].split('.')[1])
              self.db_zone1_rows = self.cur.fetch()
              
              
              self.cur.execute("select * from andre_silva_fresh_serv_fresh_servpath_sensor_10_sensor_column")
              self.db_zone2_cols = []
              for i in self.cur.getSchema():
                  if("_md" not in i['columnName'].split('.')[1] and
                     "recv" not in i['columnName'].split('.')[1]):
                      self.db_zone2_cols.append(i['columnName'].split('.')[1])
              self.db_zone2_rows = self.cur.fetch()
              

      print "Whoa! I have a database!"
Example #14
    def create_original_table(self, table_name):
        table_statement = self.table.get_table_column_with_type(
            table_name) + " )"
        #if table_statement.find("PARTITION BY") > 0:
        #partition_column=table_statement.split("PARTITION BY")[1].replace(table_name+".","").replace("(","").replace(")","").replace(" ","")
        #sql = "select data_type from v_catalog.columns where column_name= '"+partition_column+"' and table_name='"+table_name+"'"
        #data_type = self.db.retrieveSQL(sql)
        #data_type=data_type[0][0]
        #table_statement=table_statement.split("PARTITION BY")[0]
        #table_statement=first_part+ "PARTITIONED BY ("+ partition_column+ " "+data_type +" ) "
        #table_statement=table_statement.replace(partition_column+ " "+data_type+",","")

        table_statement = table_statement + " ROW FORMAT DELIMITED STORED AS ORC tblproperties ('orc.compress'='" + self.config[
            'SQOOP_COMPRESSION'] + "','orc.stripe.size'='" + self.config[
                'ORC_STRIPE_SIZE'] + "','orc.row.index.stride'='" + self.config[
                    'ORC_INDEX_STRIDE'] + "','orc.create.index'='true')"
        table_statement = table_statement.replace("\n", " ")
        table_statement = table_statement.replace("numeric", "decimal")
        table_statement = table_statement.replace("timestamp(6)", "timestamp")
        table_statement = "CREATE EXTERNAL TABLE " + self.config[
            'HIVE_DB'] + "." + table_name + " ( " + table_statement

        print table_statement

        self.logger.info("RUNNING at HIVE: %s", table_statement)
        with pyhs2.connect(host=self.env['HIVE_HOST'],
                           port=10000,
                           authMechanism="PLAIN",
                           user=self.env['USER_NAME'],
                           password='',
                           database=self.config['HIVE_DB']) as conn:
            with conn.cursor() as cur:
                cur.execute(table_statement)
                print cur.getDatabases()
Example #15
def aggregate():
    with pyhs2.connect(host='hive.athena.we7.local',
                       port=10000,
                       authMechanism="KERBEROS",
                       user='',
                       password='',
                       database='davec_sandbox'
                      ) as conn:

        with conn.cursor() as cur:

            cur.execute('''add file hdfs://athena/user/davec/agg_segment_daily_reducer.py''')

            # Hive chooses only one reducer by default (28 minutes). Force 15 (2.5 mins).
            cur.execute('''set mapred.reduce.tasks=15''')
            cur.execute('''create table if not exists davec_sandbox.agg_segment_daily (
                              segment_date    string,
                              segment_type    string,
                              user_segment    string,
                              users           int
                            )
                        ''')
            cur.execute('''
                            insert overwrite table davec_sandbox.agg_segment_daily
                            select segment_date,
                                   segment_type,
                                   user_segment,
                                   sum(cast(users as int))
                            from (
                                  select transform(*)
                                  using 'agg_segment_daily_reducer.py'
                                     as ( segment_date,
                                          segment_type,
                                          user_segment,
                                          users
                                        )
                                  from (
                                        select user_id,
                                               segment_type,
                                               user_segment,
                                               fact_year,
                                               fact_month,
                                               fact_day
                                        from events_super_mart.fact_user_segment
                                        --test--where segment_type = 'Value Segment'
                                        --test--and fact_year = 2014
                                        --test--and fact_month = 11
                                        distribute by user_id,
                                                      segment_type
                                        sort by user_id,
                                                segment_type,
                                                fact_year,
                                                cast(fact_month as int),
                                                cast(fact_day as int)
                                  ) user_segment
                            ) segment_by_date
                            group by segment_date,
                                     segment_type,
                                     user_segment
                        ''')
Example #16
    def query2hive(self,sql="SHOW TABLES",use_server=True):
        if use_server:
            host = "137.205.118.65"
        else:
            host = "localhost"


        with pyhs2.connect(host=host,
                       port=10000,
                       authMechanism="PLAIN",
                       user='******',
                       password='******',
                       database='default') as conn:
            with conn.cursor() as cur:
                #Show databases
                # print cur.getDatabases()

                #Execute query
                # cur.execute("select * from src")
                start = datetime.now()
                cur.execute(sql)
                end = datetime.now()
                time_cost =  (end - start).total_seconds()
                #Return column info from query
                # print cur.getSchema()

                #Fetch table results
                self.logger.logger.info("Time spent for HIVE query: %.4fs." % time_cost)
                for i in cur.fetch():
                    self.logger.logger.info(i)
        return i[0], time_cost
Example #17
def getSchema():
    settings = get_settings_from_file("spec.json")
    print(settings)

    conn = pyhs2.connect(host=settings.Param.HiveServer2_Host,
            port=int(settings.Param.HiveServer2_Port),
            authMechanism="PLAIN",
            user="******",
            password="",
            database="default")
    query_sql = "DESCRIBE    %s" % settings.Input.table_a.val
    cur = conn.cursor()
    cur.execute(query_sql)

    a_schema = []
    for row in cur.fetch():
       a_schema.append(("a.%s AS a_%s") %(row[0],row[0]))
  
    query_sql = "DESCRIBE    %s" % settings.Input.table_b.val
    cur = conn.cursor()
    cur.execute(query_sql)

    b_schema = []
    for row in cur.fetch():
       b_schema.append(("b.%s AS b_%s")%(row[0],row[0]))
             
    cur.close()
    conn.close()
    return [a_schema,b_schema]
Example #18
 def start_requests(self):
     with pyhs2.connect(host='10.15.1.16',
                        port=10000,
                        authMechanism="PLAIN",
                        user='******',
                        password='******',
                        database='ods') as conn:
         with conn.cursor() as cur:
             the_day = time.strftime('%Y-%m-%d', time.localtime())
             # the_day = '2018-01-04'
             cur.execute(
                 "select goods_id from ods.daily_goodsList where (dt='2017-11-08' or dt='2017-11-07' or "
                 "dt='2017-12-03' or dt='2018-01-04' or dt='{}') and user_id in (446338500,890482188,"
                 "133227658,1993730769,772352677,167873659, 98563612, 353042333, 1731961317, 353042353, 829273025, "
                 "2940972233, 2940677727, 435878238, 281917995, 737997431, 458599810, 1739810699, 1574853209, "
                 "1893742894, 1122478447, 373327370, 2647118809, 2786693231, 2434852658, 3000560259, 1916102784, "
                 "656650799, 2074964291, 708668355, 720472756, 356579667, 1731961317,98563612,1689954831,387266832"
                 ",2073309259,325718097,1891339807,834807033,1974964452,372602234,167486422,356374102,"
                 "320083279,411832242,1602582004,2428721558,1574853209,2986712394,2183615086,2424477833,1754310760,1754310760,1122478447,3000560259,2652614726,533230328,2935707588,2945786195,2074690906,3458347554,3383168585,2366121327,3099864367,106852162,152579056,1754310760,783329018,1600687454,205919815,113484749,94092459,6655,2978259752) group by goods_id"
                 .format(the_day))
             # print '************'
             for i in cur.fetch():
                 nid = i[0]
                 url = 'https://rate.tmall.com/listTagClouds.htm?itemId={}&isAll=true&isInner=true&t=1510632615425' \
                       '&_ksTS=1510632615425_2407&callback=jsonp2408'.format(nid)
                 yield scrapy.Request(url,
                                      headers=header,
                                      callback=self.parse_keyword,
                                      meta={'nid': nid},
                                      dont_filter=True)
Example #19
    def connect(self, config):
        """
        connect hiveServer2 with your config

        Parameters:
        -----------
        config : dict.
          | config['host'] The host ip of hive server 2
          | config['port'] The port of hive server 2
          | config['authMechanism'] Most of the time the value is "NOSASL"
          | config['user'] The connect user
          | config['password'] The connect password
          | config['database'] The database which you want to connect

        Returns:
        --------
        None
        """

        self.conn = pyhs2.connect(host=config['host'],
                                  port=config['port'],
                                  authMechanism=config['authMechanism'],
                                  user=config['user'],
                                  password=config['password'],
                                  database=config['database'])
        self.cur = self.conn.cursor()
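For reference, a minimal usage sketch of the connect() method above (hedged: the owning class name HiveClient, the host, and the credentials are assumptions, not values from the original):

# Hypothetical usage of connect(); replace host/credentials with real values.
config = {
    'host': '127.0.0.1',
    'port': 10000,
    'authMechanism': 'NOSASL',
    'user': 'hive',
    'password': '',
    'database': 'default',
}
client = HiveClient()          # assumed class that defines connect() above
client.connect(config)
client.cur.execute("SHOW TABLES")
print client.cur.fetch()
client.conn.close()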
Example #20
def get_connection(db_conn_conf, use_default_db):

    # Set up which database to use
    if use_default_db:
        db_name = "default"
    else:  # Set it to the one specified in the config file
        db_name = db_conn_conf['database']

    # Create the connection
    if db_conn_conf['connector'] == 'hive':
        conn = pyhs2.connect(host=db_conn_conf['host'],
                             port=int(db_conn_conf['port']),
                             authMechanism="PLAIN",
                             user=db_conn_conf['user'],
                             database=db_name)
        return conn
    elif db_conn_conf['connector'] == 'impala':
        conn = impala.dbapi.connect(host=db_conn_conf['host'],
                                    port=db_conn_conf['port'],
                                    user=db_conn_conf['user'],
                                    database=db_name)
        return conn
    else:
        print "Can't find connector"
        sys.exit(0)
Example #21
def getHiveData(loadsql, createsql, dropsql, selectsql):

    with pyhs2.connect(host='localhost',
                       port=10000,
                       authMechanism="PLAIN",
                       user='******',
                       password='******',
                       database='default') as conn:
        with conn.cursor() as cur:
            # Show databases
            # print cur.getDatabases()
            print("--------")

            cur.execute(dropsql)
            print("--------")
            cur.execute(createsql)
            print("--------")
            cur.execute(loadsql)
            print("--------")
            cur.execute(selectsql)
            print("--------")
            # Return column info from query
            # print cur.getSchema()

            # Fetch table results
            result = []
            for i in cur.fetch():
                result.append(i)
                yield i
Example #22
def get_hiveserver2_results_schema(hiveserver_config_dict, db, sql):
    host = hiveserver_config_dict["host"]
    port = hiveserver_config_dict["port"]
    username = hiveserver_config_dict["username"]
    password = hiveserver_config_dict["password"]
    auth = hiveserver_config_dict["auth"]
    connection = pyhs2.connect(host=host,
                               port=int(port),
                               authMechanism=auth,
                               user=username,
                               password=password,
                               database=db)

    cursor = connection.cursor()
    cursor.execute(sql + " limit 0")
    schemas = cursor.getSchema()

    connection.close()

    column_name_type_list = []
    print "result schemas:", schemas
    for column in schemas:
        column_name_type_list.append({
            "name": column['columnName'], "type": HiveType.change_hiveserver2_type(column['type'])
        })

    return column_name_type_list
Example #23
    def __init__(self, using=''):
        """
        @param cursor_hander: database handle name
        """
        self.cursor = None
        self.cursor_hander = using
        self.connections = None

        if str(self.cursor_hander).rstrip() == '':
            print("please write Custom_Hive's using param")
            exit(0)

        databases = {
            'ares_dw': {'host': '10.0.0.2', 'user': '******', 'password': '', 'database': 'test', 'port': 10000, 'authMechanism': 'NOSASL'},
            'hadoops2': {'host': '10.0.0.2', 'user': '******', 'password': '', 'database': 'test', 'port': 10000, 'authMechanism': 'NOSASL'},
            'basic_data': {'host': '10.0.0.2', 'user': '******', 'password': '', 'database': 'basic_data', 'port': 10000, 'authMechanism': 'NOSASL'}
        }

        database = databases[self.cursor_hander]

        self.connections = pyhs2.connect(host=database['host'],
                                         port=int(database['port']),
                                         authMechanism=database['authMechanism'],
                                         user=database['user'],
                                         password=database['password'],
                                         database=database['database'],
                                         )

        self.cursor = self.connections.cursor()
Example #24
def get_hive_connection(master_host):
    c = pyhs2.connect(host=master_host,
                      port=10000,
                      authMechanism="PLAIN",
                      user='******',
                      database='default')
    return c
Example #25
 def test_result_sets_match(self):
     for lim in (0,100,10000,11000,20000,21000):
         with pyhs2.connect(**self.hive_cfg) as conn:
             with conn.cursor() as cur:
                 cur.execute('SELECT * FROM {db}.{tbl} LIMIT {limit}'.format(db=self.test_db, tbl=self.test_tbl, limit=lim))
                 res = cur.fetch()
         self.assertEqual(res, self.test_data[:lim])
Example #26
    def __call__(self):
        try:
            self.conn = pyhs2.connect(host=self.host,
                                      port=self.port,
                                      authMechanism="PLAIN",
                                      user=self.user,
                                      password=self.password,
                                      database=self.database,
                                      timeout=self.timeout)

            if self.conn is None:
                raise Exception("can't connect hiveserver")

            cur = self.conn.cursor()
            cur.execute("show tables")
            if len(cur.fetch()) > 0:
                self._readq.nput("hive.state %s %s" % (int(time.time()), '0'))
            else:
                raise Exception("can not find any tables")
        except Exception as e:
            self._readq.nput("hive.state %s %s" % (int(time.time()), '1'))
            self.log_error(e)

        finally:
            if self.conn is not None:
                self.conn.close()
Example #27
 def __GetConnect(self):
     if self.database == 'sqlserver':
         import pyodbc
         conn_info = 'DRIVER={SQL Server};DATABASE=%s;SERVER=%s;UID=%s;PWD=%s' % (
             self.db, str(self.host) + ',' + str(self.port), self.user,
             self.pwd)
         conn = pyodbc.connect(conn_info)
         # except:
         #     import pymssql
         #     conn = pymssql.connect(host=str(self.host)+':'+str(self.port), user=self.user, password=self.pwd, database=self.db)
     elif self.database == 'hive':
         import pyhs2
         conn = pyhs2.connect(host=self.host,
                              port=self.port,
                              user=self.user,
                              authMechanism="PLAIN",
                              password=self.pwd)
     else:
         import pyodbc
         conn_info = (
             'Driver={MySQL ODBC 5.1 Driver};Server=%s;Port=%s;Database=%s;User=%s; Password=%s;Option=3;'
             % (self.host, self.port, self.db, self.user, self.pwd))
         conn = pyodbc.connect(conn_info)
     self.conn = conn
     cur = self.conn.cursor()  # use the conn connection to create (and return) a cursor object
     if not cur:
         raise NameError("Database connection failed")
     else:
         return cur
Example #28
 def __init__(self, db_host, user, password, port=10000, authMechanism="PLAIN"):
     self.conn = pyhs2.connect(host=db_host,
                               port=port,
                               authMechanism=authMechanism,
                               user=user,
                               password=password,
                               )
Example #29
def get_tweet(consumer):
    try:
        for message in consumer:
            if message is not None:
                #print message.offset, message.value, time.strftime("%Y%m")
                with pyhs2.connect(host='localhost',
                                   port=10000,
                                   authMechanism="PLAIN",
                                   user='******',
                                   password='******',
                                   database='default') as conn:
                    with conn.cursor() as cur:
                        #Show databases
                        print cur.getDatabases()

                        #Execute query
                        cur.execute(
                            "create table if not exists twitter_grippe(ID varchar(255), tweet string, date_month string)"
                        )
                        cur.execute(
                            "INSERT INTO table twitter_grippe values ('{}','{}','{}')"
                            .format(message.offset, message.value,
                                    time.strftime("%Y%m")))
            elif not message:
                print 'No message'
            else:
                print 'Something else happened..'
    except KeyboardInterrupt as e:
        pass
    return message.offset, message.value, time.strftime("%Y%m")
Example #30
def distcp_and_import(
        src_data_block_url,
        table_name,
        hive_table_meta,
        sandbox_ip,
        file_path):
    dis_data_block_url = 'hdfs://' + sandbox_ip + ':9000' + file_path

    status_distcp = []
    status_distcp.append(
            distcp(src_data_block_url, dis_data_block_url))


    status_import = []
    try:
        with pyhs2.connect(host=sandbox_ip, port=10000, authMechanism="PLAIN", user='******', password='******', database='default') as conn:
            with conn.cursor() as cur:
                # Show databases
                # print cur.getDatabases()
                query = "create external table " + table_name + \
                    "(" + hive_table_meta + ")" + " ROW FORMAT DELIMITED FIELDS TERMINATED BY ',' STORED AS TEXTFILE LOCATION " + "'" + file_path + "'"
                print(query)
                print(cur.execute(query))
                result = 0
    except BaseException:
        result = 1
    status_import.append(result)

    print [status_distcp, status_import]
Example #31
    def process(self, tup):
        movie_id = tup.values[0]
        # [{'field_name' : 'field_value', ...}, ....]
        tweets = json.loads(tup.values[1])
        # For debugging.
        #movie_id = tup[0]
        #tweets = json.loads(tup[1])
        self.log('Received %d tweets for movie %s' % (len(tweets), movie_id))

        tweets_bow = []
        for t in tweets:
            text = self.processTweet(t['text'])
            bag_of_words = self.getWordsSet(text)
            tweets_bow.append(bag_of_words)

        features = self.build_features(tweets_bow)

        pred = self.classifier.predict(features)
        pos_count = sum(pred)
        neg_count = len(pred) - pos_count

        self.log('SE: %s   +ve:%d   -ve:%d' % (movie_id, pos_count, neg_count))

        with pyhs2.connect(host = get_config('host'), port = get_config('port'), authMechanism = get_config('auth'),
                           user = str(get_config('user')), database = get_config('database')) as conn:
            with conn.cursor() as cur:
                query = ("INSERT INTO " + get_config('se_score_tablename') + " VALUES (" + str(int(time.time()))
                         + ", '" + movie_id + "', " + str(pos_count) + ", " + str(neg_count) + ")")
                if self.verbose:
                    self.log(query)
                cur.execute(query)
Example #32
def listen(id, listen_type):
    """Returning the number of playevents longer than 30 sec with ok flag for certain ids"""

    # testing if listen type is correct
    if listen_type not in ('release', 'track', 'playlist'):
        raise AttributeError("provide valid listen_type: 'release', 'playlist' or 'track'")

    #splitting ids
    cid = id_split(id)

    # uncomment the lines below to print the generated Hive query
    # print ("select count(1), src_id, src_type from playevent "
    #     "where (%s) and src_type = '%s' and play_duration > 30 and ok_flag "
    #     "group by src_id, src_type" % (cid, listen_type))

    # connecting to Hive, sending query, returning results of query
    conn = pyhs2.connect(host='nif-nif.zvq.me', port=10000, authMechanism="NOSASL", user='******', password='******',
                         database='default')
    cur = conn.cursor()
    cur.execute(
        "select count(1), src_id, src_type from playevent "
        "where (%s) and src_type = '%s' and play_duration > 30 and ok_flag "
        "group by src_id, src_type" % (cid, listen_type))
    results = cur.fetch()
    cur.close()
    conn.close()
    return results
Example #33
def apriori():
    with pyhs2.connect(host="localhost",
                       port=10000,
                       authMechanism="PLAIN",
                       user="******",
                       password="******",
                       database=u"minor") as conn:
        file_iter = open('data', 'w')

        with conn.cursor() as cur:
            cur.execute("select * from trainingset")
            for i in cur.fetch():
                for j in i:
                    file_iter.write(str(j) + ",")
                file_iter.write("\n")
            file_iter.close()

    inFile = dataFromFile('data')
    parametera = raw_input("first parameter")
    parameterb = raw_input("second parameter")

    minSupport = 0.15
    minConfidence = 0.15
    items, rules = runApriori(inFile, minSupport, minConfidence)
    printResults(items, rules, str(parametera), str(parameterb))
Example #34
def create(descriptor_path, environment):
    with open(descriptor_path) as descriptor_file:
        contents = descriptor_file.read()
        descriptor = json.loads(contents)

    logging.debug("_deploy_hbase: %s", descriptor)
    hbase = starbase.Connection(host=environment['hbase_rest_server'],
                                port=int(environment['hbase_rest_port']))
    hive_host = environment['hive_server']
    hive_port = environment['hive_port']
    hive = pyhs2.connect(host=hive_host,
                         port=hive_port,
                         authMechanism="PLAIN",
                         user='******',
                         password='******',
                         database='default')

    for element in descriptor:
        if 'table' in element and 'col_family' in element:
            table = hbase.table('%s' % element['table'])
            table.create(element['col_family'])
            for qry in element['hive_schema']:
                hive.cursor().execute(qry)

    hive.close()
Example #35
def hive_query(sqls, is_select=True, meta=False):
    '''
    Execute Hive SQL statements.
    :param sqls: one or more SQL statements separated by ";"
    :param is_select: whether the statement is a SELECT
    :param meta: if True, also return the column names
    :return:
    '''
    hive_connection = {
        "database": "elengjing",
        "host": "172.16.1.12",
        "user": "******",
        "password": "******",
        "port": 10000,
        "authMechanism": "PLAIN"
    }
    conn = pyhs2.connect(**hive_connection)
    try:
        with conn.cursor() as cursor:
            for sql in sqls.split(";"):
                if sql.strip():
                    cursor.execute(sql.strip().encode('utf8'))

            if is_select:
                columns = [_['columnName'] for _ in cursor.getSchema()]
                rows = [dict(zip(columns, _)) for _ in cursor]
                if meta:
                    return rows, columns
                else:
                    return rows
    except Exception, e:
        raise e
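A minimal usage sketch of hive_query() follows (the table name is purely illustrative):

# Hypothetical call; with meta=True the function returns (rows, column names).
rows, columns = hive_query("select * from some_table limit 10", is_select=True, meta=True)
for row in rows:          # each row is a dict keyed by column name
    print row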
Example #36
def get_cursor():
        conn = pyhs2.connect(host='',
               port=10000,
               authMechanism="PLAIN",
               user='******',
               password='',
               database='test')
        return conn.cursor()
Example #37
 def get_conn(self):
     db = self.get_connection(self.hiveserver2_conn_id)
     return pyhs2.connect(host=db.host,
                          port=db.port,
                          authMechanism=db.extra_dejson.get(
                              'authMechanism', 'NOSASL'),
                          user=db.login,
                          database=db.schema or 'default')
Example #38
 def get_connection(self):
     conn = pyhs2.connect(host=self.params.get('host'),
                          port=self.params.get('port'),
                          authMechanism=self.params.get('authMechanism'),
                          user=self.params.get('user'),
                          password=self.params.get('password'),
                          database=self.params.get('db'))
     return conn
Example #39
 def __init__(self, **hive_cfg):
     self.args = self.create_hive_config(**hive_cfg)
     self.columns = []
     try:
         self.conn = pyhs2.connect(**self.args)
         self.cursor = self.conn.cursor()
     except Exception as e:
         logging.error("Could not get a cursor. Reason:\n %s", e)
Example #40
 def __init__(self,database_name):
     conn = pyhs2.connect(host='192.168.11.55',
                port=10000,
                authMechanism="PLAIN",
                user='******',
                password='',
                database= database_name)
     self.cur=conn.cursor()
Example #41
 def get_conn(self):
     db = self.get_connection(self.hiveserver2_conn_id)
     return pyhs2.connect(
         host=db.host,
         port=db.port,
         authMechanism=db.extra_dejson.get('authMechanism', 'NOSASL'),
         user=db.login,
         database=db.schema or 'default')
Example #42
    def insert_into_pb_dim_pred(self, strat_algos):
        logging.info('')
        logging.info(" Inserting data into hive ...")

        insert_statement = """
        INSERT OVERWRITE TABLE {}.pom_pb_dimension_predictions PARTITION (algo_id)
        SELECT DISTINCT *
        FROM
          (SELECT strategy_id,
                  dimension_value_structs,
                  probability_score,
                  algo_id
           FROM
             (SELECT opti.strategy_id,
                     row_number() over (partition BY opti.strategy_id
                                        ORDER BY preds.cumulative_users ASC) AS row_n,
                     NAMED_STRUCT('dimension_type_id', 1, 'dimension_value', NAMED_STRUCT('first', CAST(sd.site_domain_id AS STRING), 'last', "", 'unit', "")) AS dimension_value_structs,
                     preds.percentage_recommendations AS probability_score,
                     opti.algorithm_id AS algo_id
              FROM {}.pom_programmable_bid_domainbidderinputfield opti
              JOIN radiumone_master.linez l ON opti.line_id = l.line_id
              JOIN dchugh.line_recommender_cumsum preds ON l.line_id = preds.line_id
              JOIN radiumone_master.SITE_DOMAINS sd ON (preds.domain_name = sd.NAME)
              ) first_domain
           WHERE first_domain.row_n = 1
           UNION ALL SELECT opti.strategy_id,
                            NAMED_STRUCT('dimension_type_id', 1, 'dimension_value', NAMED_STRUCT('first', CAST(sd.site_domain_id AS STRING), 'last', "", 'unit', "")) AS dimension_value_structs,
                            preds.percentage_recommendations AS probability_score,
                            opti.algorithm_id AS algo_id
           FROM {}.pom_programmable_bid_domainbidderinputfield opti
           JOIN radiumone_master.linez l ON opti.line_id = l.line_id
           JOIN dchugh.line_recommender_cumsum preds ON l.line_id = preds.line_id
           JOIN radiumone_master.SITE_DOMAINS sd ON (preds.domain_name = sd.NAME)
           WHERE (preds.cumulative_users <= opti.unique_count or (opti.unique_count = -1)) ) full_query
        ORDER BY strategy_id
        """.format(pb_properties.hive_db, pb_properties.hive_db,
                   pb_properties.hive_db)

        with pyhs2.connect(
                host=pb_properties.hive_server_host,
                user=pb_properties.hive_server_user,
                password=pb_properties.hive_server_password,
                port=pb_properties.hive_server_port,
                authMechanism=pb_properties.hive_server_auth) as conn:
            with conn.cursor() as cur:
                logging.info('')
                logging.info(
                    ' In host, jobserver, executing the hql statement: ' +
                    insert_statement)

                hive_settings_dyn_partition_enable = "set hive.exec.dynamic.partition=true"
                hive_settings_dyn_partition_mode = "set hive.exec.dynamic.partition.mode=nonstrict"

                cur.execute(hive_settings_dyn_partition_enable)
                cur.execute(hive_settings_dyn_partition_mode)

                cur.execute(insert_statement)
Example #43
    def get_conn(self):
        db = self.get_connection(self.hiveserver2_conn_id)
        auth_mechanism = db.extra_dejson.get("authMechanism", "NOSASL")
        if conf.get("core", "security") == "kerberos":
            auth_mechanism = db.extra_dejson.get("authMechanism", "KERBEROS")

        return pyhs2.connect(
            host=db.host, port=db.port, authMechanism=auth_mechanism, user=db.login, database=db.schema or "default"
        )
Example #44
 def _set_hive(self):
     hive = pyhs2.connect(
         host=self.config['hive']['host'],
         port=int(self.config['hive']['port']),
         authMechanism=self.config['hive']['authMechanism'],
         user=self.config['hive']['username'],
         password="",
         database=self.config['hive']['db'])
     return hive.cursor()
Example #45
    def run(self, connection, date):
        target_date = (date - datetime.timedelta(days=1)).strftime('%Y-%m-%d')

        raw_adi_logs_pathname = os.path.join(
            tempfile.gettempdir(),
            "%s.raw_adi_logs.TEMPORARY%s" % (
                target_date,
                '.txt'
            )
        )
        try:
            with open(raw_adi_logs_pathname, 'w') as f:
                hive = pyhs2.connect(
                    host=self.config.hive_host,
                    port=self.config.hive_port,
                    authMechanism=self.config.hive_auth_mechanism,
                    user=self.config.hive_user,
                    password=self.config.hive_password,
                    database=self.config.hive_database
                )

                cur = hive.cursor()
                query = self.config.query % target_date
                cur.execute(query)
                for row in cur:
                    if None not in row:
                        f.write(
                            "\t"
                            .join(
                                urllib2.unquote(str(v)).replace('\\', '\\\\')
                                for v in row
                            )
                        )
                        f.write("\n")

            with open(raw_adi_logs_pathname, 'r') as f:
                pgcursor = connection.cursor()
                pgcursor.copy_from(
                    f,
                    'raw_adi_logs',
                    null='None',
                    columns=[
                        'report_date',
                        'product_name',
                        'product_os_platform',
                        'product_os_version',
                        'product_version',
                        'build',
                        'build_channel',
                        'product_guid',
                        'count'
                    ]
                )
                pgcursor.execute(_RAW_ADI_QUERY, (target_date,))
        finally:
            if os.path.isfile(raw_adi_logs_pathname):
                os.remove(raw_adi_logs_pathname)
Example #46
 def __init__(self, host, port, username, password, db, table_name,
              column_order):
     self.conn = pyhs2.connect(host=host,
                               port=port,
                               authMechanism="PLAIN",
                               user=username,
                               password=password,
                               database=db)
     self.table_name = table_name
     self.column_order = column_order
Example #47
	def __init__(self, host, port, user, passwd, db):
		self._host = host
		self._port = port
		self._user = user
		self._passwd = passwd
		self._db = db

		self._conn = pyhs2.connect(host = self._host, port = self._port, authMechanism = "PLAIN", user = self._user, password = self._passwd, database = self._db)

		self._cur = self._conn.cursor()
Example #48
    def get_conn(self):
        db = self.get_connection(self.hiveserver2_conn_id)
        auth_mechanism = db.extra_dejson.get('authMechanism', 'NOSASL')
        if conf.get('core', 'security') == 'kerberos':
            auth_mechanism = db.extra_dejson.get('authMechanism', 'KERBEROS')

        return pyhs2.connect(
            host=db.host,
            port=db.port,
            authMechanism=auth_mechanism,
            user=db.login,
            database=db.schema or 'default')
Example #49
	def __init__(self):
		host = "dma-tools-01.eng.solidfire.net"
		auth = "PLAIN"
		user = "******"
		password = "******"
		database = "aiq_prod"
		self.hive = pyhs2.connect(host=host,
								  port=10000,
								  authMechanism=auth,
								  user=user,
								  password=password,
								  database=database)
Example #50
def loadResToHive(host, port, database, user, password, authMechanism, resultFilePath, partition):
    # connect to the hive database
    with pyhs2.connect(host=host,
                       port=port,
                       database=database,
                       user=user,
                       password=password,
                       authMechanism=authMechanism) as conn:
        with conn.cursor() as cur:
            sql = "load data local inpath '" + resultFilePath + "' into table IntRes partition(dt='" + partition + "')"
            # cur.execute("load data local inpath '/home/hadoop/PycharmProjects/IDWInterpolation/concatenateResult.txt' into table IntRes partition(dt='2016-03-01')")
            cur.execute(sql)
Example #51
 def conn(cls):
     """
     return the hive connection handle to the configured server
     """
     config = cls.config
     try:
         connection = hive.connect(host=config['host'], port=config.get('port', cls.DEFAULT_PORT),
                                   authMechanism='NOSASL', user=config['username'],
                                   password=config['password'])
     except Exception, e:
         raise DbError("Cannot connect to Hive Server: %s\n"
                       "Ensure that the server is running and you can connect normally"
                       % e.message)
     return connection
Example #52
def make_connection(delimiter, host, user, pw, database):

    conn = pyhs2.connect(host=host,
                           user=user,
                           database=database,
                           password=pw,
                           port=10000,
                           authMechanism="PLAIN")

    cur = conn.cursor()
    # Show databases
    #print cur.getDatabases()
    return cur, conn
Example #53
 def create_connection(self):
     """
     Override the create_connection from the Netezza 
     class which get's called in it's initializer
     """
     import pyhs2
     conn = pyhs2.connect(host=self.host, 
                          port=self.port,
                          authMechanism=self.auth_mechanism, 
                          user=self.user, 
                          password=self.password, 
                          database=self.database)
     return conn
Example #54
    def create_flat_table(self, table_name):
        table_statement = self.table.get_table_column_with_type(table_name) + " )"
        table_statement = table_statement + " ROW FORMAT DELIMITED"
        table_statement = table_statement + " FIELDS TERMINATED BY ',' LINES TERMINATED BY "
        table_statement = table_statement + " '\\n' STORED AS TEXTFILE LOCATION '/user/sqoop2/" + table_name + "/'"
        table_statement = table_statement.replace("numeric", "decimal")
        table_statement = table_statement.replace("timestamp(6)", "timestamp")
        table_statement = "CREATE EXTERNAL TABLE " + self.config['HIVE_DB'] + "." + table_name + "_text (" + table_statement

        self.logger.info("RUNNING at HIVE: %s", table_statement)
        with pyhs2.connect(host=self.env['HIVE_HOST'], port=10000, authMechanism="PLAIN",
                           user='******', password='', database=self.config['HIVE_DB']) as conn:
            with conn.cursor() as cur:
                cur.execute(table_statement)
Example #55
def isConnected():
    with pyhs2.connect(host='localhost',
                       port=10000,
                       authMechanism="PLAIN",
                       user='******',
                       password='******',
                       database='default') as conn:
        with conn.cursor() as cur:
            # Show databases
            try:
                cur.getDatabases()
                return True
            except Exception:
                return False
Example #56
def getDataForInt():
    with pyhs2.connect(host='172.20.111.54',
                       port=10001,
                       database='default',
                       user='******',
                       password='******',
                       authMechanism="PLAIN") as conn:
        with conn.cursor() as cur:
            # get the table which stores the rainfall values of the meteorological stations
            cur.execute("select site,value from awsrain")
            # cur.execute("select * from awsstationdesc")
            awsrain = cur.fetch()

    return awsrain
Example #57
 def get_results(self, hql, schema='default', arraysize=1000):
     schema = schema or 'default'
     with pyhs2.connect(
             host=self.hiveserver2_conn.host,
             port=self.hiveserver2_conn.port,
             authMechanism="NOSASL",
             user='******',
             database=schema) as conn:
         with conn.cursor() as cur:
             cur.execute(hql)
             return {
                 'data': cur.fetchall(),
                 'header': cur.getSchema(),
             }
Example #58
def getStationInfo(host, port, database, user, password, authMechanism):
    with pyhs2.connect(host=host,
                       port=port,
                       database=database,
                       user=user,
                       password=password,
                       authMechanism=authMechanism) as conn:
        with conn.cursor() as cur:
            # get the table which stores the geo info of the meteorological stations
            cur.execute("select sid,lon,lat from awsstationdesc")
            # cur.execute("select * from awsstationdesc")
            listStationInfo = cur.fetch()
            del listStationInfo[0]

    return listStationInfo