コード例 #1
0
def cdf():

    config = ConfigParser.RawConfigParser(allow_no_value=True)
    config.read('../config/config.cfg')

    hive_client = hive_utils.HiveClient(server=config.get('Hive', 'server'),
                                        port=config.get('Hive', 'port'))

    base_query = 'select count(*) as cnt from (select count(*) as num from checkin group by user_id) a where a.num > '

    ax = []
    ay = []

    st = 5
    tot_user = 2286501.0

    while True:

        count_query = base_query + str(st)
        cnt_int = 0

        for row in hive_client.execute(count_query):
            ax.append(st)
            cnt_per = float(row['cnt']) / tot_user
            #print type(row['cnt'])
            ay.append(cnt_per)
            print row['cnt'], cnt_per

        st = st + 50

        if int(row['cnt']) <= 1:
            break

        print count_query

    fig, axi = plt.subplots()
    axi.set_yscale("log")
    axi.set_xscale("log")
    axi.set_ylim(0, 1e6)
    axi.set_ylim(1e-10, 1e0)
    axi.set_xlabel('Number of checkins per user')
    axi.set_ylabel('CCDF')

    plt.plot(ax, ay, 'o')
    plt.savefig('cdf_test.png')
コード例 #2
0
def df():
    """Render bar charts of the top-10 check-in countries, US states and US cities.

    Connects to Hive using the ``server`` and ``port`` options of the
    ``Hive`` section in ``../config/config.cfg``, runs three grouped
    count queries against ``checkintest`` and passes each result to
    ``plot_bar_chart`` together with its axis label, title and output
    file name.
    """
    settings = ConfigParser.RawConfigParser(allow_no_value=True)
    settings.read('../config/config.cfg')

    hive_client = hive_utils.HiveClient(server=settings.get('Hive', 'server'),
                                        port=settings.get('Hive', 'port'))

    def _labels_and_counts(query, label_column):
        # Run one query and split its rows into parallel label/count lists.
        labels = []
        counts = []
        for record in hive_client.execute(query):
            labels.append(record[label_column])
            counts.append(int(record['count']))
        return labels, counts

    # All three queries run before any chart is drawn.
    country_label, country_cnt = _labels_and_counts(
        "select country,count(*) as count from checkintest group by country sort by count desc limit 10",
        'country')
    us_state_label, us_state_cnt = _labels_and_counts(
        "select state,count(*) as count from checkintest where country='US' group by state sort by count desc limit 10",
        'state')
    us_city_label, us_city_cnt = _labels_and_counts(
        "select city,count(*) as count from checkintest where country='US' group by city sort by count desc limit 10",
        'city')

    y_label = 'Check-in Count'
    plot_bar_chart(country_cnt, country_label, y_label,
                   'Top 10 Check-in countries', 'top10_country.png')
    plot_bar_chart(us_state_cnt, us_state_label, y_label,
                   'Top 10 Check-in states in US', 'top10_state_us.png')
    plot_bar_chart(us_city_cnt, us_city_label, y_label,
                   'Top 10 Check-in cities in US', 'top10_city_us.png')
コード例 #3
0
ファイル: database.py プロジェクト: Ameniabdelhamid/hadoop
LINES TERMINATED BY '\n'
STORED AS TEXTFILE;
"""

# HiveQL that loads a CSV file from the local filesystem into the
# products.catalog table.  The path is absolute and machine-specific.
query1 = """
LOAD DATA LOCAL INPATH '/home/ameni/PycharmProjects/Product_Service/productdb.csv'
INTO TABLE products.catalog;
"""

# HiveQL that reads back at most five rows from products.catalog.
query2 = """
SELECT * FROM products.catalog LIMIT 5;
"""

# Client for the Hive server on port 10000, using the `products` database.
hive_client = hive_utils.HiveClient(
    server='0.0.0.0',
    port=10000,
    db='products'
)
# The statements run in order: `query` first, then the load, then the select.
# NOTE(review): `query` is defined above this excerpt; presumably the
# CREATE TABLE statement for products.catalog -- confirm.
a = hive_client.execute(query)
b = hive_client.execute(query1)
c = hive_client.execute(query2)
# Materialise the select result into a list.
c = list(c)

import pyhs2
with pyhs2.connect(host='localhost',
           port=10000,
           authMechanism="PLAIN",
           user='******',
           password='******',
           database='your_default_db') as conn:
        with conn.cursor() as cur:
コード例 #4
0
import hive_utils
from hive_service.ttypes import HiveServerException

# get the input parameters
if len(sys.argv) != 6:
    print 'Usage: python hiveserver1-client.py <hive_host> <hive_port> <db_name> <hadoop_user> <hadoop_password>'
    sys.exit()

hiveHost = sys.argv[1]
hivePort = sys.argv[2]
dbName = sys.argv[3]
hadoopUser = sys.argv[4]
hadoopPassword = sys.argv[5]

# do the connection
client = hive_utils.HiveClient(server=hiveHost, port=hivePort, db=dbName)

# create a loop attending HiveQL queries
while (1):
    query = raw_input('remotehive> ')

    try:
        if not query:
            continue

        if query == 'exit':
            sys.exit()

        # execute the query
        for row in client.execute(query):
            print row