def write(self, df, destination):
    """Write ``df`` into a Hive 3 table through the Hive Warehouse Connector.

    Optionally creates the target database, renames an existing table to a
    backup (``<db>.<table>`` + ``DEFAULT_BACKUP_SUFFIX``) before writing, and
    computes table statistics afterwards.

    :param df: Spark DataFrame to persist.
    :param destination: config object providing ``database``, ``table``,
        ``create_database``, ``create_backup``, ``write_disposition``,
        ``options``, ``partition_by`` and ``compute_statistics``.
    """
    logger.info('Hive3Writer.write(destination={0})'.format(destination))
    # Imported lazily: pyspark_llap is only present on HDP cluster nodes.
    from pyspark_llap import HiveWarehouseSession
    hive = HiveWarehouseSession.session(self.spark).build()
    if destination.create_database:
        hive.createDatabase(destination.database, True)
    dst = '{0}.{1}'.format(destination.database, destination.table)
    if destination.create_backup:
        if self._table_exists(hive, destination):
            backup_table = '{0}{1}'.format(dst, DEFAULT_BACKUP_SUFFIX)
            # Drop any stale backup first, then rename current -> backup.
            hive.dropTable(backup_table, True, True)
            hive.executeUpdate('ALTER TABLE {0} RENAME TO {1}'.format(
                dst, backup_table))
    writer = df.write \
        .mode(destination.write_disposition) \
        .format(HiveWarehouseSession.HIVE_WAREHOUSE_CONNECTOR) \
        .option('table', dst)
    if destination.options:
        # FIX: dict.iteritems() exists only on Python 2; items() is
        # equivalent here and works on both Python 2 and 3.
        for key, value in destination.options.items():
            writer = writer.option(key, value)
    if destination.partition_by:
        writer = writer.partitionBy(*destination.partition_by)
    writer.save()
    if destination.compute_statistics:
        hive.executeUpdate(
            'ANALYZE TABLE {0} COMPUTE STATISTICS'.format(dst))
def read(self, source):
    """Read a Hive table (or an explicit query) into a Spark DataFrame.

    Uses ``source.query`` verbatim when provided; otherwise selects every
    row of ``source.database.source.table``.
    """
    logger.info('Hive3Reader.read(source={0})'.format(source))
    # pyspark_llap is only available on the cluster, hence the local import.
    from pyspark_llap import HiveWarehouseSession
    session = HiveWarehouseSession.session(self.spark).build()
    if source.query:
        statement = source.query
    else:
        statement = 'SELECT * FROM {0}.{1}'.format(
            source.database, source.table)
    return session.executeQuery(statement)
# Smoke test: connect to Hive through the Hive Warehouse Connector from a
# plain PySpark session and run a trivial query.
import os
import sys

from pyspark_llap import HiveWarehouseSession
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('PySparkTestHWC')
    .enableHiveSupport()
    .getOrCreate()
)
hive = HiveWarehouseSession.session(spark).build()
hive.showDatabases().show()
hive.executeQuery("select * from test_tbl limit 10").show()
Inserting data into a Hive table failed with permission errors on the HDFS folder /tmp/hive/xxxx. The issue was resolved once the folder was granted access: hdfs dfs -chmod -R 777 /tmp/hive

Open a pyspark shell:
===================
pyspark --jars /usr/hdp/current/hive_warehouse_connector/hive-warehouse-connector-assembly-1.0.0.3.1.0.0-78.jar --py-files /usr/hdp/current/hive_warehouse_connector/pyspark_hwc-1.0.0.3.1.0.0-78.zip

import os, sys
from pyspark_llap import HiveWarehouseSession
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('PySparkTest').getOrCreate()
hive = HiveWarehouseSession.session(spark).build()
hive.showDatabases().show()
hive.showTables().show()

Run a query: mydf = hive.executeQuery("select * from t1")
Create a Hive table: hive.createTable("arun_test").ifNotExists().column("col1","string").column("col2","string").create()
Write data into the Hive table (note: HIVE_WAREHOUSE_CONNECTOR is a class attribute, accessed on the class itself): mydf.write.format(HiveWarehouseSession.HIVE_WAREHOUSE_CONNECTOR).mode("append").option("table","arun_test").save()
def scraping():
    """Scrape the Amazon ES electronics best-sellers and load them into Hive.

    Walks up to two best-seller pages, collecting per-product fields
    (rank id, title, url, image, stars, review count, price, Prime flag).
    When a page has no "next" link, the collected rows are written to a
    timestamped CSV, copied to HDFS, and inserted row-by-row into the
    ``amazontfm.bestsellers`` Hive table through the Hive Warehouse
    Connector.

    Side effects: network requests, console prints, a local CSV file, an
    ``hdfs dfs`` subprocess call, and Hive DDL/DML.
    """
    product_num = 0
    productList = []
    now = datetime.datetime.now()
    current_date = now.strftime("%Y-%m-%d %H:%M:%S")
    mainpage = "https://www.amazon.es/gp/bestsellers/electronics/ref=zg_bs_nav_0"
    settings = [
        ('spark.sql.hive.hiveserver2.jdbc.url',
         'jdbc:hive2://sandbox-hdp.hortonworks.com:10000/default'),
    ]
    # NOTE(review): the save/insert branch below only runs when a page has no
    # "next" link.  If both scraped pages expose one, nothing is ever saved —
    # confirm whether two iterations is really the intended page count.
    for i in range(2):
        req = urllib2.Request(mainpage)
        content = urllib2.urlopen(req).read()
        bs = BeautifulSoup(content, 'html.parser')
        # Narrow parsing to the best-seller grid only.
        data = bs.find('div', {"id": "zg-center-div"})
        bs = BeautifulSoup(str(data), 'html.parser')
        products = bs.find_all('li', {"class": "zg-item-immersion"})
        for product in products:
            print("------------------------------")
            product_id = product.find('span', {"class": "zg-badge-text"}).text
            product_id = product_id[1:]  # drop the leading '#' of the rank badge
            print(product_id)
            product_link_info = product.find('a', {"class": "a-link-normal"})
            product_url = "https://www.amazon.es"
            if product_link_info is not None:
                product_url = product_url + product_link_info['href']
            print(product_url)
            product_image_info = product.find('span', {
                "class": "zg-text-center-align"
            }).find('img')
            if product_image_info is not None:
                product_image = product_image_info['src']
                # Title comes from the image alt text; strip non-ASCII and
                # single quotes so it can be embedded in SQL below.
                product_title_text = product_image_info['alt'].encode(
                    'utf-8').decode('ascii', 'ignore')
                product_title = product_title_text.replace("'", "")
            else:
                product_image = "None"
                product_title = "None"
            print(product_image)
            print(product_title)
            product_price_info = product.find('span', {"class": "p13n-sc-price"})
            if product_price_info is not None:
                # Drop the trailing currency marker and normalise the
                # decimal separator ("12,34 €" -> "12.34").
                product_price_text = product_price_info.text
                product_price = product_price_text[:-2].replace(',', '.')
            else:
                product_price = "0"
            print(product_price)
            product_stars_info = product.find('div', {"class": "a-icon-row"})
            if product_stars_info is not None:
                product_stars_text = product_stars_info.find(
                    'span', {"class": "a-icon-alt"}).text
                # "4,5 de 5 estrellas" -> "4.5"
                product_stars = product_stars_text[:3].replace(',', '.')
                product_stars_count_text = product_stars_info.find(
                    'a', {"class": "a-size-small"}).text
                # "1.234" (thousands separator) -> "1234"
                product_stars_count = product_stars_count_text.replace('.', '')
            else:
                product_stars = "0"
                product_stars_count = "0"
            print(product_stars)
            print(product_stars_count)
            product_prime_info = product.find('i', {"class": "a-icon-prime"})
            if product_prime_info is not None:
                product_prime = "1"
            else:
                product_prime = "0"
            print(product_prime)
            productList.append([
                current_date, product_id, product_title, product_url,
                product_image, product_stars, product_stars_count,
                product_price, product_prime
            ])
            product_num += 1
        next_page_info = bs.find('ul', {
            "class": "a-pagination"
        }).find('li', {
            "class": "a-last"
        }).find('a')
        if next_page_info is not None:
            next_page = next_page_info['href']
            mainpage = next_page
            print(next_page)
            time.sleep(60)  # be polite between page fetches
        else:
            print("THE END - MANAGEMENT OF DATA")
            conf = SparkConf().setAppName("Pyspark and Hive!").setAll(settings)
            # Spark 2: SparkSession replaces SparkContext/HiveContext.
            spark = (
                SparkSession.builder.config(conf=conf)
                .enableHiveSupport().getOrCreate())
            hive = HiveWarehouseSession.session(spark).userPassword(
                'hive', 'hive').build()
            df = pd.DataFrame.from_records(productList)
            df.columns = [
                'ScrapDate', 'Id', 'Title', 'Url', 'Image', 'Stars',
                'StarsCount', 'Price', 'Prime'
            ]
            # Build the file name once; it is reused for the HDFS copy below.
            csv_name = 'AmazonProducts' + now.strftime("%m%d%Y%H%M%S") + '.csv'
            df.to_csv(csv_name, index=False)
            subprocess.call(
                'hdfs dfs -copyFromLocal ./' + csv_name +
                ' hdfs://sandbox-hdp.hortonworks.com:8020/tmp',
                shell=True)
            hive.executeUpdate("CREATE DATABASE IF NOT EXISTS amazontfm")
            hive.setDatabase("amazontfm")
            hive.executeUpdate(
                "CREATE TABLE IF NOT EXISTS bestsellers(ScrapDate timestamp, Id int, Title string, Url string, Image string, Stars decimal(6,2), StarsCount int, Price decimal(6,2), Prime int)"
            )

            # HWC's executeUpdate offers no parameter binding, so values must
            # be interpolated into the SQL text.  Escape embedded single
            # quotes so a scraped value can neither break nor inject into the
            # statement (SQL-standard '' escaping).
            def _q(value):
                return value.replace("'", "''")

            for index, row in df.iterrows():
                print("Inserting row: " + str(index))
                # FIX: the statement used to be hand-concatenated twice (once
                # for the log print, once for executeUpdate); build it once.
                stmt = (
                    "INSERT INTO bestsellers (ScrapDate, Id, Title, Url, Image, Stars, StarsCount, Price, Prime) VALUES('"
                    + _q(row["ScrapDate"]) + "', " + row["Id"] + ", '"
                    + _q(row["Title"]) + "', '" + _q(row["Url"]) + "', '"
                    + _q(row["Image"]) + "', " + row["Stars"] + ", "
                    + row["StarsCount"] + ", " + row["Price"] + ", "
                    + row["Prime"] + ")")
                print(stmt)
                hive.executeUpdate(stmt)
                print("Row inserted: " + str(index))
            print("THE END")