def write(self, df, destination):
    """Write ``df`` into a Hive 3 table through the Hive Warehouse Connector.

    Optionally creates the target database, renames an existing table to a
    backup (``<db>.<table>`` + ``DEFAULT_BACKUP_SUFFIX``) before writing, and
    computes table statistics afterwards.

    :param df: Spark DataFrame to persist.
    :param destination: config object providing ``database``, ``table``,
        ``create_database``, ``create_backup``, ``write_disposition``,
        ``options``, ``partition_by`` and ``compute_statistics``.
    """
    logger.info('Hive3Writer.write(destination={0})'.format(destination))
    # Imported lazily: pyspark_llap is only present on HDP cluster nodes.
    from pyspark_llap import HiveWarehouseSession
    hive = HiveWarehouseSession.session(self.spark).build()
    if destination.create_database:
        hive.createDatabase(destination.database, True)
    dst = '{0}.{1}'.format(destination.database, destination.table)
    if destination.create_backup:
        if self._table_exists(hive, destination):
            backup_table = '{0}{1}'.format(dst, DEFAULT_BACKUP_SUFFIX)
            # Drop any stale backup first, then rename current -> backup.
            hive.dropTable(backup_table, True, True)
            hive.executeUpdate('ALTER TABLE {0} RENAME TO {1}'.format(
                dst, backup_table))
    writer = df.write \
        .mode(destination.write_disposition) \
        .format(HiveWarehouseSession.HIVE_WAREHOUSE_CONNECTOR) \
        .option('table', dst)
    if destination.options:
        # FIX: dict.iteritems() exists only on Python 2; items() is
        # equivalent here and works on both Python 2 and 3.
        for key, value in destination.options.items():
            writer = writer.option(key, value)
    if destination.partition_by:
        writer = writer.partitionBy(*destination.partition_by)
    writer.save()
    if destination.compute_statistics:
        hive.executeUpdate(
            'ANALYZE TABLE {0} COMPUTE STATISTICS'.format(dst))
def read(self, source):
    """Read a Hive table (or an explicit query) into a Spark DataFrame.

    Uses ``source.query`` verbatim when provided; otherwise selects every
    row of ``source.database.source.table``.
    """
    logger.info('Hive3Reader.read(source={0})'.format(source))
    # pyspark_llap is only available on the cluster, hence the local import.
    from pyspark_llap import HiveWarehouseSession
    session = HiveWarehouseSession.session(self.spark).build()
    if source.query:
        statement = source.query
    else:
        statement = 'SELECT * FROM {0}.{1}'.format(
            source.database, source.table)
    return session.executeQuery(statement)
# Smoke test: connect to Hive through the Hive Warehouse Connector from a
# plain PySpark session and run a trivial query.
import os
import sys

from pyspark_llap import HiveWarehouseSession
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName('PySparkTestHWC')
    .enableHiveSupport()
    .getOrCreate()
)
hive = HiveWarehouseSession.session(spark).build()
hive.showDatabases().show()
hive.executeQuery("select * from test_tbl limit 10").show()
Inserting data into a Hive table failed with permission errors on the HDFS folder /tmp/hive/xxxx. The issue was resolved once the folder was granted access: hdfs dfs -chmod -R 777 /tmp/hive

Open a pyspark shell:
===================
pyspark --jars /usr/hdp/current/hive_warehouse_connector/hive-warehouse-connector-assembly-1.0.0.3.1.0.0-78.jar --py-files /usr/hdp/current/hive_warehouse_connector/pyspark_hwc-1.0.0.3.1.0.0-78.zip

import os, sys
from pyspark_llap import HiveWarehouseSession
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('PySparkTest').getOrCreate()
hive = HiveWarehouseSession.session(spark).build()
hive.showDatabases().show()
hive.showTables().show()

Run a query: mydf = hive.executeQuery("select * from t1")
Create a Hive table: hive.createTable("arun_test").ifNotExists().column("col1","string").column("col2","string").create()
Write data into the Hive table (note: HIVE_WAREHOUSE_CONNECTOR is a class attribute, accessed on the class itself): mydf.write.format(HiveWarehouseSession.HIVE_WAREHOUSE_CONNECTOR).mode("append").option("table","arun_test").save()
def scraping():
    """Scrape the Amazon ES electronics best-sellers and load them into Hive.

    Walks up to two best-seller pages, collecting per-product fields
    (rank id, title, url, image, stars, review count, price, Prime flag).
    When a page has no "next" link, the collected rows are written to a
    timestamped CSV, copied to HDFS, and inserted row-by-row into the
    ``amazontfm.bestsellers`` Hive table through the Hive Warehouse
    Connector.

    Side effects: network requests, console prints, a local CSV file, an
    ``hdfs dfs`` subprocess call, and Hive DDL/DML.
    """
    product_num = 0
    productList = []
    now = datetime.datetime.now()
    current_date = now.strftime("%Y-%m-%d %H:%M:%S")
    mainpage = "https://www.amazon.es/gp/bestsellers/electronics/ref=zg_bs_nav_0"
    settings = [
        ('spark.sql.hive.hiveserver2.jdbc.url',
         'jdbc:hive2://sandbox-hdp.hortonworks.com:10000/default'),
    ]
    # NOTE(review): the save/insert branch below only runs when a page has no
    # "next" link.  If both scraped pages expose one, nothing is ever saved —
    # confirm whether two iterations is really the intended page count.
    for i in range(2):
        req = urllib2.Request(mainpage)
        content = urllib2.urlopen(req).read()
        bs = BeautifulSoup(content, 'html.parser')
        # Narrow parsing to the best-seller grid only.
        data = bs.find('div', {"id": "zg-center-div"})
        bs = BeautifulSoup(str(data), 'html.parser')
        products = bs.find_all('li', {"class": "zg-item-immersion"})
        for product in products:
            print("------------------------------")
            product_id = product.find('span', {"class": "zg-badge-text"}).text
            product_id = product_id[1:]  # drop the leading '#' of the rank badge
            print(product_id)
            product_link_info = product.find('a', {"class": "a-link-normal"})
            product_url = "https://www.amazon.es"
            if product_link_info is not None:
                product_url = product_url + product_link_info['href']
            print(product_url)
            product_image_info = product.find('span', {
                "class": "zg-text-center-align"
            }).find('img')
            if product_image_info is not None:
                product_image = product_image_info['src']
                # Title comes from the image alt text; strip non-ASCII and
                # single quotes so it can be embedded in SQL below.
                product_title_text = product_image_info['alt'].encode(
                    'utf-8').decode('ascii', 'ignore')
                product_title = product_title_text.replace("'", "")
            else:
                product_image = "None"
                product_title = "None"
            print(product_image)
            print(product_title)
            product_price_info = product.find('span', {"class": "p13n-sc-price"})
            if product_price_info is not None:
                # Drop the trailing currency marker and normalise the
                # decimal separator ("12,34 €" -> "12.34").
                product_price_text = product_price_info.text
                product_price = product_price_text[:-2].replace(',', '.')
            else:
                product_price = "0"
            print(product_price)
            product_stars_info = product.find('div', {"class": "a-icon-row"})
            if product_stars_info is not None:
                product_stars_text = product_stars_info.find(
                    'span', {"class": "a-icon-alt"}).text
                # "4,5 de 5 estrellas" -> "4.5"
                product_stars = product_stars_text[:3].replace(',', '.')
                product_stars_count_text = product_stars_info.find(
                    'a', {"class": "a-size-small"}).text
                # "1.234" (thousands separator) -> "1234"
                product_stars_count = product_stars_count_text.replace('.', '')
            else:
                product_stars = "0"
                product_stars_count = "0"
            print(product_stars)
            print(product_stars_count)
            product_prime_info = product.find('i', {"class": "a-icon-prime"})
            if product_prime_info is not None:
                product_prime = "1"
            else:
                product_prime = "0"
            print(product_prime)
            productList.append([
                current_date, product_id, product_title, product_url,
                product_image, product_stars, product_stars_count,
                product_price, product_prime
            ])
            product_num += 1
        next_page_info = bs.find('ul', {
            "class": "a-pagination"
        }).find('li', {
            "class": "a-last"
        }).find('a')
        if next_page_info is not None:
            next_page = next_page_info['href']
            mainpage = next_page
            print(next_page)
            time.sleep(60)  # be polite between page fetches
        else:
            print("THE END - MANAGEMENT OF DATA")
            conf = SparkConf().setAppName("Pyspark and Hive!").setAll(settings)
            # Spark 2: SparkSession replaces SparkContext/HiveContext.
            spark = (
                SparkSession.builder.config(conf=conf)
                .enableHiveSupport().getOrCreate())
            hive = HiveWarehouseSession.session(spark).userPassword(
                'hive', 'hive').build()
            df = pd.DataFrame.from_records(productList)
            df.columns = [
                'ScrapDate', 'Id', 'Title', 'Url', 'Image', 'Stars',
                'StarsCount', 'Price', 'Prime'
            ]
            # Build the file name once; it is reused for the HDFS copy below.
            csv_name = 'AmazonProducts' + now.strftime("%m%d%Y%H%M%S") + '.csv'
            df.to_csv(csv_name, index=False)
            subprocess.call(
                'hdfs dfs -copyFromLocal ./' + csv_name +
                ' hdfs://sandbox-hdp.hortonworks.com:8020/tmp',
                shell=True)
            hive.executeUpdate("CREATE DATABASE IF NOT EXISTS amazontfm")
            hive.setDatabase("amazontfm")
            hive.executeUpdate(
                "CREATE TABLE IF NOT EXISTS bestsellers(ScrapDate timestamp, Id int, Title string, Url string, Image string, Stars decimal(6,2), StarsCount int, Price decimal(6,2), Prime int)"
            )

            # HWC's executeUpdate offers no parameter binding, so values must
            # be interpolated into the SQL text.  Escape embedded single
            # quotes so a scraped value can neither break nor inject into the
            # statement (SQL-standard '' escaping).
            def _q(value):
                return value.replace("'", "''")

            for index, row in df.iterrows():
                print("Inserting row: " + str(index))
                # FIX: the statement used to be hand-concatenated twice (once
                # for the log print, once for executeUpdate); build it once.
                stmt = (
                    "INSERT INTO bestsellers (ScrapDate, Id, Title, Url, Image, Stars, StarsCount, Price, Prime) VALUES('"
                    + _q(row["ScrapDate"]) + "', " + row["Id"] + ", '"
                    + _q(row["Title"]) + "', '" + _q(row["Url"]) + "', '"
                    + _q(row["Image"]) + "', " + row["Stars"] + ", "
                    + row["StarsCount"] + ", " + row["Price"] + ", "
                    + row["Prime"] + ")")
                print(stmt)
                hive.executeUpdate(stmt)
                print("Row inserted: " + str(index))
            print("THE END")