def run_query(self, query, user): connection = presto.connect( host=self.configuration.get('host', ''), port=self.configuration.get('port', 8080), username=self.configuration.get('username', 'redash'), catalog=self.configuration.get('catalog', 'hive'), schema=self.configuration.get('schema', 'default')) cursor = connection.cursor() try: cursor.execute(query) column_tuples = [(i[0], PRESTO_TYPES_MAPPING.get(i[1], None)) for i in cursor.description] columns = self.fetch_columns(column_tuples) rows = [dict(zip(([c['name'] for c in columns]), r)) for i, r in enumerate(cursor.fetchall())] data = {'columns': columns, 'rows': rows} json_data = json.dumps(data, cls=JSONEncoder) error = None except DatabaseError as db: json_data = None default_message = 'Unspecified DatabaseError: {0}'.format(db.message) message = db.message.get('failureInfo', {'message', None}).get('message') error = default_message if message is None else message except (KeyboardInterrupt, InterruptException) as e: cursor.cancel() error = "Query cancelled by user." json_data = None except Exception as ex: json_data = None error = ex.message if not isinstance(error, basestring): error = unicode(error) return json_data, error
def __init__(self, host=None, db=None, port=None, presto_conn_id='presto_default'): self.user = '******' if not presto_conn_id: self.host = host self.db = db self.port = port else: session = settings.Session() db = session.query( Connection).filter( Connection.conn_id == presto_conn_id) if db.count() == 0: raise Exception("The presto_conn_id you provided isn't defined") else: db = db.all()[0] self.host = db.host self.db = db.schema self.catalog = 'hive' self.port = db.port self.cursor = presto.connect(host=db.host, port=db.port, username=self.user, catalog=self.catalog, schema=db.schema).cursor() session.close() # currently only a pass in pyhive
def test_invalid_kwargs(self): """some kwargs are reserved""" self.assertRaisesRegexp( ValueError, 'Cannot override', lambda: presto.connect( host=_HOST, username='******', requests_kwargs={'url': 'test'} ).cursor() )
def makeGraph(table,dt): """ Build the entire failure graph with all DNS errors found in the table at the given date. """ sys.stderr.write("Build the graph. ") # Get DNS failed requests req0 = "select dst, dns_question from %s where dns_rcode!='NOERROR' and ( dns_question like '%% A' or dns_question like '%% AAAA' ) and dt='%s' " % (table,dt); # Ignored certain request (antivirus,...) for fqdn in ignoredDomains: req0 += " and not regexp_like(dns_question, '(?i).*%s .*') " % fqdn #print req0 cursor = presto.connect('localhost').cursor() cursor.execute(req0) #Make the corresponding failure graph G = nx.Graph() count = 0 for data in cursor: addr = data[0] if data[1] is None: continue ques = data[1].split()[0] G.add_node(addr,bipartite=0) G.add_node(ques,bipartite=1) G.add_edge(addr,ques) return G
def run_query(self, query): connection = presto.connect( host=self.configuration.get('host', ''), port=self.configuration.get('port', 8080), username=self.configuration.get('username', 'redash'), catalog=self.configuration.get('catalog', 'hive'), schema=self.configuration.get('schema', 'default')) cursor = connection.cursor() try: cursor.execute(query) columns_data = [(row[0], row[1]) for row in cursor.description] columns = [{'name': col[0], 'friendly_name': col[0], 'type': PRESTO_TYPES_MAPPING.get(col[1], None)} for col in columns_data] rows = [dict(zip(([c[0] for c in columns_data]), r)) for i, r in enumerate(cursor.fetchall())] data = {'columns': columns, 'rows': rows} json_data = json.dumps(data, cls=JSONEncoder) error = None except Exception, ex: json_data = None error = ex.message
def RunQuery(self, query, parameters): cursor = presto.connect('localhost').cursor() try: cursor.execute(query, parameters) except presto.DatabaseError as e: raise res = cursor.fetchall() return res
def test_requests_session(self): with requests.Session() as session: connection = presto.connect( host=_HOST, port=_PORT, source=self.id(), requests_session=session ) cursor = connection.cursor() cursor.execute('SELECT * FROM one_row') self.assertEqual(cursor.fetchall(), [(1,)])
def test_requests_kwargs(self): connection = presto.connect( host=_HOST, port=_PORT, source=self.id(), requests_kwargs={'proxies': {'http': 'localhost:99999'}}, ) cursor = connection.cursor() self.assertRaises(requests.exceptions.ProxyError, lambda: cursor.execute('SELECT * FROM one_row'))
def test_invalid_password_and_kwargs(self): """password and requests_kwargs authentication are incompatible""" self.assertRaisesRegexp( ValueError, 'Cannot use both', lambda: presto.connect( host=_HOST, username='******', password='******', protocol='https', requests_kwargs={'auth': requests.auth.HTTPBasicAuth('user', 'secret')} ).cursor() )
def findNtpAmplifiers(table,today=datetime.date.today(),verbose=False): """ Find NTP amplifiers in the given traffic (table) and store the results in the 'ntpamplifiers' Hive table. """ date = "%d%02d%02d" % (today.year, today.month, today.day) table = scrub(table) ## set some variables regarding the input data if table.startswith("netflow"): dataType = "netflow" req0 = "select sa, sum(ibyt), sum(ipkt) from %s where sp=123 and dt='%s' and pr='UDP' and ibyt/ipkt=468 group by sa" % (table,date) elif table.startswith("sflow"): dataType = "sflow" req0 = "select srcip, sum(ipsize), count(*) from %s where udpsrcport=123 and ipprotocol=17 and ipsize=468 and dt='%s' group by srcip" % (table,date) else: sys.stderr.write("Data type unknown!") sys.exit(-1) cursor = presto.connect('localhost').cursor() if verbose: sys.stdout.write("Looking for %s NTP amplifiers... (%s)\n" % (date,table)) # get today's data cursor.execute(req0) res = cursor.fetchall() if len(res)==0: return data = pd.DataFrame(res,columns=["srcip", "nbbyt", "nbpkt"]) # add the confidence score: data["confidence"] = "LOW" data.loc[data.nbpkt>=100,"confidence"] = "MED" data.loc[data.nbpkt>=1000,"confidence"] = "HIGH" outputFile = open("%s/ntpamplifiers_%s_%s.txt" % (outputDirectory,table,date),"w") data.to_csv(outputFile,sep="\t",header=False,cols=["srcip","nbbyt","nbpkt","confidence"],index=False) outputFile.close() # Store results in Hive try: transport = TSocket.TSocket('localhost', 10000) transport = TTransport.TBufferedTransport(transport) protocol = TBinaryProtocol.TBinaryProtocol(transport) client = ThriftHive.Client(protocol) transport.open() client.execute("create table if not exists ntpamplifiers (srcip string, byte bigint, pkt bigint, confidence string) partitioned by(dt string, dataSrc string) row format delimited fields terminated by '\t'"); client.execute("load data local inpath '{dir}/ntpamplifiers_{table}_{date}.txt' overwrite into table ntpamplifiers partition (dt='{date}', dataSrc='{table}')".format(table=table,date=date,dir=outputDirectory)) transport.close() except Thrift.TException, tx: sys.stderr.write('%s\n' % (tx.message))
def get_conn(self): """Returns a connection object""" db = self.get_connection(self.presto_conn_id) return presto.connect( host=db.host, port=db.port, username=db.login, catalog=db.extra_dejson.get('catalog', 'hive'), schema=db.schema)
def find_amplifiers(): # Find NTP amplifiers and keep ip in Hive Table req = "create table ntp_amplifiers as " req += "select sa from netflow where sp=123 and pr='UDP' and ibyt/ipkt=468 group by sa" cursor = presto.connect('localhost').cursor() cursor.execute(req) res = cursor.fetchall()
def get_conn(self): """Returns a connection object""" db = self.get_connection(self.conn_id_name) return presto.connect( host=db.host, port=db.port, username=db.login, catalog=db.extra_dejson.get("catalog", "hive"), schema=db.schema, )
def find_victims(): # Request the data req = "select da, sum(ibyt) as vol from netflow where sa in (select sa from ntp_amplifiers ) group by da order by vol desc" cursor = presto.connect('localhost').cursor() cursor.execute(req) res = cursor.fetchall() for i in range(3): print res[i] plot_traffic_volume(res[i][0])
def get_conn(self): """Returns a connection object""" db = self.get_connection(self.presto_conn_id) reqkwargs = None if db.password is not None: reqkwargs = {'auth': HTTPBasicAuth(db.login, db.password)} return presto.connect( host=db.host, port=db.port, username=db.login, source=db.extra_dejson.get('source', 'airflow'), protocol=db.extra_dejson.get('protocol', 'http'), catalog=db.extra_dejson.get('catalog', 'hive'), requests_kwargs=reqkwargs, schema=db.schema)
def test_set_session_in_consructor(self): conn = presto.connect( host=_HOST, source=self.id(), session_props={'query_max_run_time': '1234m'} ) with contextlib.closing(conn): with contextlib.closing(conn.cursor()) as cursor: cursor.execute('SHOW SESSION') rows = [r for r in cursor.fetchall() if r[0] == 'query_max_run_time'] assert len(rows) == 1 session_prop = rows[0] assert session_prop[1] == '1234m' cursor.execute('RESET SESSION query_max_run_time') cursor.fetchall() cursor.execute('SHOW SESSION') rows = [r for r in cursor.fetchall() if r[0] == 'query_max_run_time'] assert len(rows) == 1 session_prop = rows[0] assert session_prop[1] != '1234m'
def test_set_session_in_constructor(self): conn = presto.connect( host=_HOST, source=self.id(), session_props={'query_max_run_time': '1234m'} ) with contextlib.closing(conn): with contextlib.closing(conn.cursor()) as cursor: cursor.execute('SHOW SESSION') rows = [r for r in cursor.fetchall() if r[0] == 'query_max_run_time'] assert len(rows) == 1 session_prop = rows[0] assert session_prop[1] == '1234m' cursor.execute('RESET SESSION query_max_run_time') cursor.fetchall() cursor.execute('SHOW SESSION') rows = [r for r in cursor.fetchall() if r[0] == 'query_max_run_time'] assert len(rows) == 1 session_prop = rows[0] assert session_prop[1] != '1234m'
def __init__(self, connection_string, username=None, password=None, proxy_user=None, impersonate=False, *args, **kwargs): presto_conf = get_presto_connection_conf(connection_string) host = presto_conf.host port = 8080 if not presto_conf.port else presto_conf.port # default to querybook credentials if user/pwd is not supplied # we pass auth credentials through requests_kwargs instead of # using requests library's builtin auth to bypass the https requirement # and set the proper Authorization header req_kwargs = {} if username and password: auth = (username, password) if proxy_user and impersonate: auth = HTTPBasicAndProxyAuth( auth, # Basic Auth (proxy_user, "no pass"), # proxy user auth, password not required ) req_kwargs["auth"] = auth connection = presto.connect( host, port=port, username=proxy_user or username, catalog=presto_conf.catalog, schema=presto_conf.schema, source="querybook", protocol=presto_conf.protocol, requests_kwargs=req_kwargs, ) self._connection = connection super(PrestoClient, self).__init__()
def run_query(self, query, user): connection = presto.connect( host=self.configuration.get("host", ""), port=self.configuration.get("port", 8080), protocol=self.configuration.get("protocol", "http"), username=self.configuration.get("username", "redash"), password=(self.configuration.get("password") or None), catalog=self.configuration.get("catalog", "hive"), schema=self.configuration.get("schema", "default"), ) cursor = connection.cursor() try: cursor.execute(query) column_tuples = [ (i[0], PRESTO_TYPES_MAPPING.get(i[1], None)) for i in cursor.description ] columns = self.fetch_columns(column_tuples) rows = [ dict(zip(([column["name"] for column in columns]), r)) for i, r in enumerate(cursor.fetchall()) ] data = {"columns": columns, "rows": rows} json_data = json_dumps(data) error = None except DatabaseError as db: json_data = None default_message = "Unspecified DatabaseError: {0}".format(str(db)) if isinstance(db.args[0], dict): message = db.args[0].get("failureInfo", {"message", None}).get( "message" ) else: message = None error = default_message if message is None else message except (KeyboardInterrupt, InterruptException, JobTimeoutException): cursor.cancel() raise return json_data, error
def connect(self, params={}): host = params.get('host') port = params.get('port', 8080) self.host = host self.port = port kwargs = { 'host': host, 'port': port, 'username': params.get('username'), 'catalog': params.get('catalog', 'hive'), 'schema': params.get('schema', 'default'), 'poll_interval': params.get('poll_interval', 1), 'source': params.get('source', 'pyhive'), } self.cursor = presto.connect(**kwargs).cursor() self.logger.info("Connecting to %s:%s " % (host, port)) self.timeout = 5
def connect(self, params={}): host = params.get("host") port = params.get("port", 8080) self.host = host self.port = port kwargs = { "host": host, "port": port, "username": params.get("username"), "catalog": params.get("catalog", "hive"), "schema": params.get("schema", "default"), "poll_interval": params.get("poll_interval", 1), "source": params.get("source", "pyhive"), } self.cursor = presto.connect(**kwargs).cursor() self.logger.info("Connecting to %s:%s " % (host, port)) self.timeout = 5
def handler(context, v3io_username, v3io_access_key, presto_api, presto_table): """ Query data from KV table via Presto. """ context.logger.info("Connecting to Presto") req_kw = { "auth": (v3io_username, v3io_access_key), "verify": False, } conn = presto.connect( presto_api, port=443, username=v3io_username, protocol="https", requests_kwargs=req_kw, ) context.logger.info("Querying Presto") df = pd.read_sql_query(f"select * from {presto_table}", conn) context.logger.info("Logging Dataset") context.log_dataset("data", df=df, format="csv", index=False)
def run_query(self, query, user): connection = presto.connect( host=self.configuration.get('host', ''), port=self.configuration.get('port', 8080), username=self.configuration.get('username', 'redash'), catalog=self.configuration.get('catalog', 'hive'), schema=self.configuration.get('schema', 'default')) cursor = connection.cursor() try: cursor.execute(query) column_tuples = [(i[0], PRESTO_TYPES_MAPPING.get(i[1], None)) for i in cursor.description] columns = self.fetch_columns(column_tuples) rows = [ dict(zip(([c['name'] for c in columns]), r)) for i, r in enumerate(cursor.fetchall()) ] data = {'columns': columns, 'rows': rows} json_data = json.dumps(data, cls=JSONEncoder) error = None except DatabaseError as db: json_data = None default_message = 'Unspecified DatabaseError: {0}'.format( db.message) message = db.message.get('failureInfo', {'message', None}).get('message') error = default_message if message is None else message except (KeyboardInterrupt, InterruptException) as e: cursor.cancel() error = "Query cancelled by user." json_data = None except Exception as ex: json_data = None error = ex.message if not isinstance(error, basestring): error = unicode(error) return json_data, error
def plot_traffic_volume(ip): # Request the data req = "select ts, ibyt from netflow where sa = '{0}' or da = '{0}'".format(ip) cursor = presto.connect('localhost').cursor() cursor.execute(req) res = cursor.fetchall() # Pre-process for ploting data = pd.DataFrame(res,columns=["ts", "bytes"]) data.index = pd.to_datetime(data.pop("ts"), unit="s") data = data.resample('H',how="sum") ix = pd.DatetimeIndex(start=data.index[0]-datetime.timedelta(0.5), end=data.index[-1]+datetime.timedelta(0.5), freq='H') data = data.reindex(ix) data.bytes = data.bytes.fillna(0) # Plot the data plt.figure() data.plot(lw=2) plt.grid(True) plt.savefig("traffic_volume_%s.png" % ip) plt.close()
def create_kpi_df(**kwargs): today = kwargs['execution_date'] + timedelta(days=1) today = today.strftime("%Y-%m-%d") cursor = presto.connect('presto.smartnews.internal', port=8081, username=OWNER).cursor() cursor.execute(sql_kpi.format(execution_date=today)) ret = cursor.fetchall() columns = [c[0] for c in cursor.description] kpi_df = pd.DataFrame(ret, columns=columns) #Creating KPI DF kpi_df['28DAU'] = kpi_df[['28DAU']].applymap(lambda x: '{:,}'.format(x)) kpi_df['MHU'] = kpi_df[['MHU']].applymap(lambda x: '{:,}'.format(x)) kpi_df['MHDAU'] = kpi_df[['MHDAU']].applymap(lambda x: '{:,}'.format(x)) kpi_df['DAU'] = kpi_df[['DAU']].applymap(lambda x: '{:,}'.format(x)) kpi_df['Installs'] = kpi_df[['Installs' ]].applymap(lambda x: '{:,}'.format(x)) #Rendering HTML global s s = kpi_df.to_html(index=False, justify='left')
def run_query(self, query): connection = presto.connect( host=self.configuration.get('host', ''), port=self.configuration.get('port', 8080), username=self.configuration.get('username', 'redash'), catalog=self.configuration.get('catalog', 'hive'), schema=self.configuration.get('schema', 'default')) cursor = connection.cursor() try: cursor.execute(query) column_tuples = [(i[0], PRESTO_TYPES_MAPPING.get(i[1], None)) for i in cursor.description] columns = self.fetch_columns(column_tuples) rows = [dict(zip(([c['name'] for c in columns]), r)) for i, r in enumerate(cursor.fetchall())] data = {'columns': columns, 'rows': rows} json_data = json.dumps(data, cls=JSONEncoder) error = None except Exception, ex: json_data = None error = ex.message
def __init__(self, host, port=8080, username=None, catalog='hive', schema='default', poll_interval=1, source='pyhive', session_props=None, protocol='http', password=None, requests_session=None, requests_kwargs=None, **kwargs): conn = presto.connect(host, port, username, catalog, schema, poll_interval, source, session_props, protocol, password, requests_session, requests_kwargs, **kwargs) cursor = conn.cursor() super(Presto, self).__init__(conn, cursor)
def train(self): b_time = time.time() cursor = presto.connect('presto.smartnews.internal', 8081).cursor() query = f""" select content_id, count(*) as count from hive_ad.z_seanchuang.i2i_offline_train_raw where dt = '{config['dt']}' and content_id != '' and behavior_types = '{config['behavior']}' group by 1 order by 2 desc limit {config['topN']} """ print(query) cursor.execute(query) column_names = [desc[0] for desc in cursor.description] df = pd.DataFrame(cursor.fetchall(), columns=column_names) self.pop_items = df['content_id'].tolist() print(df) print("Train finished ... : ", time.time() - b_time)
def worker(): while True: item = q.get() if item is None: break cursor = presto.connect(host='HOST', port=PORT, protocol='https', name='NAME', password=password, session_props={ 'query_max_run_time': timeout }).cursor() try: cursor.execute(query.format(dbname, item, item + hosesize)) i = pd.DataFrame(cursor.fetchall(), columns=[i[0] for i in cursor.description]) #Thread locking mechanism try: lock.acquire() output = pd.concat([output, i]) except Exception as err: raise err finally: lock.release() except DatabaseError as err: if err.args[0].get('message').find('Query exceeded maximum') >= 0: q.put(item) lock.acquire() logger.update({}) lock.release() else: raise else: raise q.task_done()
def data_from_hive(sql, source): # 配置presto属性 try: conn = presto.connect(host="180.150.189.61", port=26623, catalog="hive", schema=source) start = time.time() print("数据库连接完成!") try: cursor = conn.cursor() print("数据提取中......") cursor.execute(sql) # query print("数据提取完成,耗时{}\n".format(time.time() - start)) return pd.DataFrame( cursor.fetchall(), columns=list(zip(*cursor.description))[0]) # return 数据 & 列名称 finally: cursor.close() finally: conn.close()
def RunQuery(params, **kwargs): parameters = [] time_max = (datetime.utcnow() + timedelta(days=2)).strftime("%Y%m%d") time_min = (datetime.utcnow() - timedelta(days=20)).strftime("%Y%m%d") ip = None if 'time.max' in params.keys(): time_max = params['time.max'][0].strftime("%Y%m%d") if 'time.min' in params.keys(): time_min = params['time.min'][0].strftime("%Y%m%d") if 'ip' in params.keys(): ip = params['ip'] query = "select srcip,dstip,ipprotocol,tcpsrcport,udpsrcport,tcpdstport,udpdstport,unixsecondsutc from " + params['source'][0] if time_max != '' or time_min != '' or ip is not None: query += " where " if time_max != '': query += " dt<=%s" parameters.append(time_max) if time_min != '': query += " and dt>=%s" parameters.append(time_min) if ip is not None: query += " and (srcip = " + " or srcip = ".join(["%s" for i in ip]) + " or dstip = " + " or dstip = ".join(["%s" for i in ip]) + ")" print query for j in [1,2]: for i in ip: parameters.append(i) query += " order by dt" cursor = presto.connect('localhost').cursor() try: cursor.execute(query, parameters) except presto.DatabaseError as e: raise res = cursor.fetchall() return res
def printData(fieldValues): if fieldValues != None: print "beginDSInfo" print """fileName;#;true csv_first_row_has_column_names;true;true; csv_separator;|;true csv_number_grouping;,;true csv_number_decimal;.;true csv_date_format;d.M.yyyy;true""" print ''.join(['host;', fieldValues[0], ';true']) print ''.join(['port;', fieldValues[1], ';true']) print ''.join(['catalog;', fieldValues[2], ';true']) print ''.join(['schema;', fieldValues[3], ';true']) print ''.join(['query;', fieldValues[4], ';true']) print "endDSInfo" print "beginData" try: cursor = presto.connect(host=fieldValues[0], port=fieldValues[1], catalog=fieldValues[2], schema=fieldValues[3]).cursor() cursor.execute(fieldValues[4]) columns = [i[0] for i in cursor.description] print ', '.join(columns) for values in cursor.fetchall(): print ', '.join( str(value).replace(',', '.') for value in values) except Exception, e: easygui.msgbox('failed because of %s' % e.message) print "Error" print "Failed" print("endData")
def execute_presto(sql, engine): if engine is None: if 'PRESTO_ENGINE' in os.environ: engine = os.environ['PRESTO_ENGINE'] else: engine = "sn" log.info('executing...') addr = ENGINES[engine] p = presto.connect(addr['host'], port=addr['port'], catalog='hive_ad', username='******').cursor() try: p.execute(sql) d = p.fetchall() df = DataFrame(d) log.info('done') if len(d) != 0: df.columns = map(lambda x: x['name'], p._columns) return df except presto.DatabaseError as e: log.exception(e)
def presto_connector(host='localhost', port=8089, catalog='elasticsearch', schema='default', *args, **kwargs): """ presto connector API :param host: :param port: :param catalog: :param schema: :param args: :param kwargs: :return: """ logger.info("Connecting to PRESTO server [{}:{}/{}/{}]".format( host, port, catalog, schema)) return presto.connect(host=host, port=port, catalog=catalog, schema=schema, *args, **kwargs).cursor()
def execute_sql(sql): host = '172.17.2.55' p_port = u'9090' p_username = '******' conn = presto.connect(host, port=p_port, username=p_username) cursor = conn.cursor() cursor.execute(sql) columns = [col[0] for col in cursor.description] ## result = [dict(zip(columns,row)) for row in cursor.fetchall()] row = cursor.fetchall() row = list(row) row = [list(i) for i in row] result = DataFrame(row, columns=columns) return result
def train(self): b_time = time.time() cursor = presto.connect('presto.smartnews.internal', 8081).cursor() #query = f"select * from hive_ad.z_seanchuang.i2i_offline_item_topk_items where dt='{self.dt}'" query = f""" with test_item as ( select distinct content_id as item from hive_ad.z_seanchuang.i2i_offline_test_raw where dt = '{self.dt}' ) select all.* from hive_ad.z_seanchuang.i2i_offline_item_topk_items all inner join test_item test on all.item = test.item and all.dt='{self.dt}' """ cursor.execute(query) column_names = [desc[0] for desc in cursor.description] df = pd.DataFrame(cursor.fetchall(), columns=column_names) self.items_similar = df.set_index('item')['topk_json'].to_dict( defaultdict(list)) print("Train finished ... : ", time.time() - b_time)
def test_bad_protocol(self): self.assertRaisesRegexp(ValueError, 'Protocol must be', lambda: presto.connect('localhost', protocol='nonsense').cursor())
def test_invalid_protocol_config(self): """protocol should be https when passing password""" self.assertRaisesRegexp( ValueError, 'Protocol.*https.*password', lambda: presto.connect( host=_HOST, username='******', password='******', protocol='http').cursor() )
t0 = time.time() result = cursor.fetchall() delta_t = time.time() - t0 if (len(result) == 0): return ("Null", t0, delta_t) else: return (str(result[0]), t0, delta_t) #Write to file #with open("test.txt", "a") as myfile: # myfile.write(getTimeString(t0) + "," + str(query) + "," + str(result[0]) + "\n") ##Result is a 2-tuple, with time taken and TECHNOLOGY+NUM WORKER NODES (Presto6) def appendResultToFile(filename, startTime, queryID, year, timeTaken, result): outString = "{},{},{},{:.2f},{},{}\n".format(getTimeString(time.time()), queryID, year, timeTaken, result, "Presto6") with open(filename, "a") as myfile: myfile.write(outString) print(outString) cursor = presto.connect('localhost').cursor() for year in [2007,2009,2012]: for queryID in range(1,5): #Repeat 2 times for i in range(1,4): result, t0, time_taken = runTimeQuery(year, mapQuery[queryID]) appendResultToFile("PrestoQueries.txt", t0, queryID, year, time_taken, result)
def connect(self): return presto.connect(host=_HOST, source=self.id())
from string import Template #import credentials from pyhive import presto # conn_string = credentials.getRedshiftCredentials() print 'Connecting to database\n ->%s' % ('"presto.ophan.co.uk", port=8889, catalog="hive", schema="temp_ah"') conn = presto.connect("presto.ophan.co.uk", port=8889, catalog="hive", schema="temp_ah") print 'Connected' def presto_query(conn, query): cur = conn.cursor() cur.execute(query) cols = [column[0] for column in cur.description] fetch = cur.fetchall() results = [] for row in fetch: results.append(dict(zip(cols, row))) return results # define functions to execute sql queries # Series and episodes published last 30 days def series_published_sql(): return presto_query(conn, """ SELECT publication_date, section, COUNT(series_url) AS series_published_to, SUM(episodes_published_per_series) AS episodes_published FROM (SELECT publication_date, series_url, section, COUNT(episode_url) AS episodes_published_per_series
from decimal import Decimal from collections import OrderedDict from enum import Enum, unique import boto3 from boto3.dynamodb.conditions import Key import json from json import encoder encoder.FLOAT_REPR = lambda o: format(o, '.4f') logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO) log = logging.getLogger(__name__) cursor = presto.connect('presto.smartnews.internal', 8081).cursor() ddb_client = boto3.client('dynamodb') dynamodb = boto3.resource('dynamodb', region_name='ap-northeast-1') # enum @unique class Action(Enum): # View = "View" # AddToCart = "AddToCart" # Purchase = "Purchase" View = "ViewContent" AddToCart = "AddToCart" Purchase = "revenue"
query = '''with a as (select data, row_number() over (range unbounded preceding) as rn from {} ) select data rn from a where rn>={} and rn<{} ''' cursor = presto.connect(host='HOST', port=PORT, protocol='https', name='NAME', password=password, session_props={ 'query_max_run_time': timeout }).cursor() #Basic query check. try: cursor.execute(query.format(dbname, 0, 100000)) cursor.fetchall() except DatabaseError as err: if err.args[0].get('message').find('Query exceeded maximum') >= 0: pass else: raise else: raise
def presto_query(query): import pandas as pd from pyhive import presto conn = presto.connect(host='presto-server', port=8080) return pd.read_sql_query(query, conn, parse_dates=['timestamp'])
import pandas as pd import sqlalchemy from pyhive import presto query = ''' SELECT wbanno, min(temperature) t_min, avg(temperature) t_avg, max(temperature) t_max FROM uscrn GROUP BY 1 ''' conn = presto.connect(host='presto-server', port=8080) features = pd.read_sql_query(query, conn) print(features) uri = "postgresql://*****:*****@datamart:5432/datamart" engine = sqlalchemy.create_engine(uri) features.to_sql('temperature_features', index=False, if_exists='replace', con=engine) print('temperature_features created')
banco = client['john'] collection = banco['nba'] # collection = banco.album # ELASTIC es = Elasticsearch(hosts= server) # REDIS r = redis.Redis( host=server, port=6379) #CONNECT PRESTO connect = presto.connect(host=server,port=8080) def read_presto(): df = pd.read_sql_query('select * from elastic.default.john',connect) print(df.head()) def write_mongo(data_dict): try: data_id = collection.insert_many(data_dict) print(data_id) except NameError: print(NameError) def read_csv(path): data = pd.read_csv(path)
def get_presto_connection(attemps=3): return presto.connect(host=HOST, port=DB_PORT)
def query_age_price(self): conn = presto.connect(**PRESTO_SERVER) cursor = conn.cursor() cursor.execute(AGE_PRICE_QUERY) tuples = cursor.fetchall() return tuples
def get_connection(): return presto.connect(**presto_config)
#!/usr/bin/env python import logging logging.basicConfig(level=logging.DEBUG) # from pyhive import hive # conn = hive.Connection(host="localhost", port=9083, username="******") from pyhive import presto cursor = presto.connect(host="localhost", port=10000, username="******").cursor() sql = 'select * from default limit 10' cursor.execute(sql) print(cursor.fetchone()) print(cursor.fetchall())