Example #1
    def write(self, df, destination):
        logger.info('Hive3Writer.write(destination={0})'.format(destination))
        from pyspark_llap import HiveWarehouseSession
        hive = HiveWarehouseSession.session(self.spark).build()
        if destination.create_database:
            hive.createDatabase(destination.database, True)
        dst = '{0}.{1}'.format(destination.database, destination.table)
        if destination.create_backup:
            if self._table_exists(hive, destination):
                backup_table = '{0}{1}'.format(dst, DEFAULT_BACKUP_SUFFIX)
                hive.dropTable(backup_table, True, True)
                hive.executeUpdate('ALTER TABLE {0} RENAME TO {1}'.format(
                    dst, backup_table))

        writer = df.write \
            .mode(destination.write_disposition) \
            .format(HiveWarehouseSession.HIVE_WAREHOUSE_CONNECTOR) \
            .option('table', dst)

        if destination.options:
            for key, value in destination.options.items():
                writer = writer.option(key, value)

        if destination.partition_by:
            writer = writer.partitionBy(*destination.partition_by)

        writer.save()

        if destination.compute_statistics:
            hive.executeUpdate(
                'ANALYZE TABLE {0} COMPUTE STATISTICS'.format(dst))
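
For orientation, a hypothetical call to this writer. The destination object below is only a stand-in built from the attributes the method actually reads (database, table, create_database, create_backup, write_disposition, options, partition_by, compute_statistics); the real project presumably has its own destination class, and Hive3Writer(spark) is assumed to do nothing more than store the SparkSession on self.spark.

# Hypothetical usage sketch -- SimpleNamespace only mirrors the attributes
# that write() accesses above; the project's real destination class is not shown.
from types import SimpleNamespace

destination = SimpleNamespace(
    database='sales_db',
    table='daily_orders',
    create_database=True,           # triggers hive.createDatabase(database, True)
    create_backup=True,             # renames any existing table to *_backup first
    write_disposition='overwrite',  # passed straight to DataFrameWriter.mode()
    options={'fileformat': 'orc'},  # forwarded as .option(key, value) pairs
    partition_by=['order_date'],    # forwarded to .partitionBy()
    compute_statistics=True)        # runs ANALYZE TABLE ... COMPUTE STATISTICS

writer = Hive3Writer(spark)  # assumed: the constructor keeps the SparkSession as self.spark
writer.write(df, destination)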
Example #2
    def read(self, source):
        logger.info('Hive3Reader.read(source={0})'.format(source))
        query = source.query if source.query else 'SELECT * FROM {0}.{1}'.format(
            source.database, source.table)
        from pyspark_llap import HiveWarehouseSession
        hive = HiveWarehouseSession.session(self.spark).build()
        df = hive.executeQuery(query)
        return df
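
Under the same assumptions, the reader only needs a source carrying either a query attribute or a database/table pair, since the method falls back to a SELECT * over the whole table when no query is given. A brief, hypothetical sketch:

# Hypothetical source objects mirroring the attributes read() inspects.
from types import SimpleNamespace

whole_table = SimpleNamespace(query=None, database='sales_db', table='daily_orders')
ad_hoc = SimpleNamespace(query='SELECT id, price FROM sales_db.daily_orders',
                         database=None, table=None)

reader = Hive3Reader(spark)  # assumed to wrap a SparkSession, as in Example #1
df = reader.read(ad_hoc)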
Example #3
import os, sys
from pyspark_llap import HiveWarehouseSession
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName(
    'PySparkTestHWC').enableHiveSupport().getOrCreate()
hive = HiveWarehouseSession.session(spark).build()
hive.showDatabases().show()
hive.executeQuery("select * from test_tbl limit 10").show()
Example #4
Inserting data into a Hive table failed with permission errors on the HDFS folder /tmp/hive/xxxx. The issue was resolved once the folder was granted access:

hdfs dfs -chmod -R 777 /tmp/hive

Open the pyspark shell:
=======================
pyspark --jars /usr/hdp/current/hive_warehouse_connector/hive-warehouse-connector-assembly-1.0.0.3.1.0.0-78.jar --py-files /usr/hdp/current/hive_warehouse_connector/pyspark_hwc-1.0.0.3.1.0.0-78.zip

import os,sys
from pyspark_llap import HiveWarehouseSession
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('PySparkTest').getOrCreate()
hive = HiveWarehouseSession.session(spark).build()
hive.showDatabases().show()
hive.showTables().show()

Run a query:
mydf = hive.executeQuery("select * from t1")

Create a Hive table:
hive.createTable("arun_test").ifNotExists().column("col1","string").column("col2","string").create()

Write data into the Hive table:
mydf.write.format(HiveWarehouseSession.HIVE_WAREHOUSE_CONNECTOR).mode("append").option("table","arun_test").save()
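
Note that besides passing the connector jar and the pyspark_hwc zip on the command line, HWC has to know where HiveServer2 is listening. Example #5 below supplies spark.sql.hive.hiveserver2.jdbc.url through a SparkConf; the same setting can be given when building the session, roughly like this (the sandbox URL is just a placeholder for your own HiveServer2 endpoint):

# Sketch: point HWC at HiveServer2 via the session builder instead of SparkConf.
from pyspark.sql import SparkSession
from pyspark_llap import HiveWarehouseSession

spark = SparkSession.builder \
    .appName('PySparkTest') \
    .config('spark.sql.hive.hiveserver2.jdbc.url',
            'jdbc:hive2://sandbox-hdp.hortonworks.com:10000/default') \
    .getOrCreate()
hive = HiveWarehouseSession.session(spark).build()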
Example #5
import datetime
import subprocess
import time
import urllib2

import pandas as pd
from bs4 import BeautifulSoup
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark_llap import HiveWarehouseSession


def scraping():
    product_num = 0
    productList = []
    now = datetime.datetime.now()
    current_date = now.strftime("%Y-%m-%d %H:%M:%S")
    mainpage = "https://www.amazon.es/gp/bestsellers/electronics/ref=zg_bs_nav_0"
    settings = [
        ('spark.sql.hive.hiveserver2.jdbc.url',
         'jdbc:hive2://sandbox-hdp.hortonworks.com:10000/default'),
    ]

    for i in range(2):
        #content = requests.get(mainpage)#, headers=self.header)
        req = urllib2.Request(mainpage)
        content = urllib2.urlopen(req).read()
        #bs = BeautifulSoup(content.text, 'html.parser')
        bs = BeautifulSoup(content, 'html.parser')
        data = bs.find('div', {"id": "zg-center-div"})
        bs = BeautifulSoup(str(data), 'html.parser')
        products = bs.find_all('li', {"class": "zg-item-immersion"})

        for product in products:
            print("------------------------------")
            product_id = product.find('span', {"class": "zg-badge-text"}).text
            product_id = product_id[1:]  # the leading '#' needs to be stripped
            print(product_id)
            product_link_info = product.find('a', {"class": "a-link-normal"})
            product_url = "https://www.amazon.es"
            if product_link_info is not None:
                product_url = product_url + product_link_info['href']
            print(product_url)
            product_image_info = product.find('span', {
                "class": "zg-text-center-align"
            }).find('img')
            if product_image_info is not None:
                product_image = product_image_info['src']
                product_title_text = product_image_info['alt'].encode(
                    'utf-8').decode('ascii', 'ignore')
                product_title = product_title_text.replace("'", "")
            else:
                product_image = "None"
                product_title = "None"
            print(product_image)
            # product_title = product.find('span', {"class":"zg-text-center-align"}).find('img')['alt']
            # product_title_text = product_title.encode('utf-8').decode('ascii', 'ignore')
            # product_title = product_title_text.replace("'", "")
            print(product_title)
            product_price_info = product.find('span',
                                              {"class": "p13n-sc-price"})
            if product_price_info is not None:
                product_price_text = product.find('span', {
                    "class": "p13n-sc-price"
                }).text
                product_price = product_price_text[:-2].replace(',', '.')
            else:
                product_price = "0"
            print(product_price)
            product_stars_info = product.find('div', {"class": "a-icon-row"})
            if product_stars_info is not None:
                product_stars_text = product_stars_info.find(
                    'span', {
                        "class": "a-icon-alt"
                    }).text
                product_stars = product_stars_text[:3].replace(',', '.')
                product_stars_count_text = product_stars_info.find(
                    'a', {
                        "class": "a-size-small"
                    }).text
                product_stars_count = product_stars_count_text.replace('.', '')
            else:
                product_stars = "0"
                product_stars_count = "0"
            print(product_stars)
            print(product_stars_count)
            product_prime_info = product.find('i', {"class": "a-icon-prime"})
            if product_prime_info is not None:
                product_prime = "1"
            else:
                product_prime = "0"
            print(product_prime)

            #productList[product_num] = [current_date, product_id, product_title, product_url, product_image, product_stars, product_stars_count, product_price, product_prime]
            productList.append([
                current_date, product_id, product_title, product_url,
                product_image, product_stars, product_stars_count,
                product_price, product_prime
            ])
            product_num += 1

        next_page_info = bs.find('ul', {
            "class": "a-pagination"
        }).find('li', {
            "class": "a-last"
        }).find('a')
        if next_page_info is not None:
            next_page = next_page_info['href']
            mainpage = next_page
            print(next_page)
            time.sleep(60)
        else:
            print("THE END - MANAGEMENT OF DATA")

            conf = SparkConf().setAppName("Pyspark and Hive!").setAll(settings)
            #Spark 2: use SparkSession instead of SparkContext.
            spark = (
                SparkSession.builder.config(conf=conf)
                # There is no HiveContext anymore either.
                .enableHiveSupport().getOrCreate())
            hive = HiveWarehouseSession.session(spark).userPassword(
                'hive', 'hive').build()

            df = pd.DataFrame.from_records(productList)
            df.columns = [
                'ScrapDate', 'Id', 'Title', 'Url', 'Image', 'Stars',
                'StarsCount', 'Price', 'Prime'
            ]
            df.to_csv('AmazonProducts' + now.strftime("%m%d%Y%H%M%S") + '.csv',
                      index=False)

            subprocess.call('hdfs dfs -copyFromLocal ./AmazonProducts' +
                            now.strftime("%m%d%Y%H%M%S") +
                            '.csv hdfs://sandbox-hdp.hortonworks.com:8020/tmp',
                            shell=True)

            hive.executeUpdate("CREATE DATABASE IF NOT EXISTS amazontfm")
            hive.setDatabase("amazontfm")
            hive.executeUpdate(
                "CREATE TABLE IF NOT EXISTS bestsellers(ScrapDate timestamp, Id int, Title string, Url string, Image string, Stars decimal(6,2), StarsCount int, Price decimal(6,2), Prime int)"
            )
            for index, row in df.iterrows():
                print("Inserting row: " + str(index))
                print("INSERT INTO bestsellers VALUES('" + row["ScrapDate"] +
                      "', " + row["Id"] + ", '" + row["Title"] + "', '" +
                      row["Url"] + "', '" + row["Image"] + "', " +
                      row["Stars"] + ", " + row["StarsCount"] + ", " +
                      row["Price"] + ", " + row["Prime"] + ")")
                hive.executeUpdate(
                    "INSERT INTO bestsellers (ScrapDate, Id, Title, Url, Image, Stars, StarsCount, Price, Prime) VALUES('"
                    + row["ScrapDate"] + "', " + row["Id"] + ", '" +
                    row["Title"] + "', '" + row["Url"] + "', '" +
                    row["Image"] + "', " + row["Stars"] + ", " +
                    row["StarsCount"] + ", " + row["Price"] + ", " +
                    row["Prime"] + ")")
                print("Row inserted: " + str(index))

            print("THE END")