def main(self, sc, *args):
    sql_context = SQLContext(sc)
    df = self.read_from_hdfs(sql_context)
    df = self.count_occurrence(df)
    self.write_to_hdfs(df)
import pickle

import findspark
from dataMining.checkDB import checkStatus
from dataMining.databaseInit import initializeDatabase
from dataMining.mongoQuery import askMongo
from flask_uploads import UploadSet, configure_uploads
from flask import Flask, render_template, request, send_file

findspark.init()

# pyspark imports must come after findspark.init()
from pyspark import SparkContext, SQLContext
from pyspark.ml.recommendation import ALSModel

app = Flask(__name__)
app.config.from_pyfile('config.py')

sc = SparkContext(appName="Yelp")
sc.setLogLevel("ERROR")
sqlc = SQLContext(sc)

with open('../data/Charlotte_Restaurants_review.pickle', 'rb') as f:
    all_visited = pickle.load(f)
with open('../data/Charlotte_Restaurants_business.pickle', 'rb') as f:
    rest = pickle.load(f)
with open('../data/Charlotte_Restaurants_user.pickle', 'rb') as f:
    user = pickle.load(f)

n_business = len(rest)
best_model = ALSModel.load('../data/Charlotte_als_model')
def main_acr_preprocess():  # def main():
    parameter = load_json_config("./parameter.json")
    list_args = parameter["acr_preprocess"]
    DATA_DIR = parameter["DATA_DIR"]
    path_pickle = DATA_DIR + list_args["path_pickle"]
    path_tf_record = DATA_DIR + list_args["path_tf_record"]
    input_word_embeddings_path = DATA_DIR + list_args["input_word_embeddings_path"]
    vocab_most_freq_words = list_args["vocab_most_freq_words"]
    max_words_length = list_args["max_words_length"]
    output_word_vocab_embeddings_path = DATA_DIR + list_args["output_word_vocab_embeddings_path"]
    output_label_encoders = DATA_DIR + list_args["output_label_encoders"]
    output_tf_records_path = DATA_DIR + list_args["output_tf_records_path"]
    output_articles_csv_path_preprocessed = DATA_DIR + list_args["output_articles_csv_path_preprocessed"]
    output_articles_csv_path_original = DATA_DIR + list_args["output_articles_csv_path_original"]
    articles_by_tfrecord = list_args["articles_by_tfrecord"]
    mysql_host = list_args["mysql_host"]
    mysql_user = list_args["mysql_user"]
    mysql_passwd = list_args["mysql_passwd"]
    mysql_database = list_args["mysql_database"]
    mysql_table = list_args["mysql_table"]
    domain = list_args["domain"]

    print("<=== STARTING ACR PREPROCESS ===>")
    spark = spark_inital()
    sc = spark.sparkContext
    sqlContext = SQLContext(sc)

    # ACR PREPROCESS
    if not path.exists(output_label_encoders):
        os.makedirs(output_label_encoders)
        os.makedirs(output_word_vocab_embeddings_path)
        os.makedirs(output_articles_csv_path_preprocessed)
        os.makedirs(output_articles_csv_path_original)
    if not path.exists(path_tf_record):
        os.makedirs(path_tf_record)

    list_args_2 = parameter["acr_training"]
    acr_path = DATA_DIR + list_args_2["output_acr_metadata_embeddings_path"]
    acr_label_encoders, articles_metadata_df, content_article_embeddings = deserialize(
        get_all_file(acr_path)[0])
    isEmpty = 0

    # LOAD NEWS FROM MYSQL
    news_df_from_mysql = handle_database_news(domain, mysql_host, mysql_user,
                                              mysql_passwd, mysql_database,
                                              mysql_table,
                                              list_args['date_start'],
                                              list_args["date_end"])
    # filter columns
    list_cul = [
        'catId', 'content', 'email', 'newsId', 'publishDate', 'sapo',
        'sourceNews', 'tags', 'title', 'url'
    ]
    news_df_from_mysql = news_df_from_mysql[news_df_from_mysql["sourceNews"] == "CafeBiz"]
    news_df_from_mysql = news_df_from_mysql.dropna()
    # remove deleted news
    news_df_from_mysql = news_df_from_mysql[news_df_from_mysql["is_deleted"] == 0]
    news_df_from_mysql = news_df_from_mysql[list_cul]
    if news_df_from_mysql.empty:  # if empty
        isEmpty = 1
        return isEmpty

    news_df_handle_by_spark = handle_database_news_by_spark(
        news_df_from_mysql, spark)
    news_df = remove_duplicate_newsid(news_df_handle_by_spark,
                                      acr_label_encoders)
    if news_df.empty:  # if empty
        isEmpty = 1
        return isEmpty

    # fill 0 for NaN values entities (person, location)
    # news_df.fillna("0", inplace=True)
    news_df = custome_df(news_df)

    print("Saving original news articles CSV to {}".format(
        output_articles_csv_path_original))
    write_articale_csv_original(news_df, output_articles_csv_path_original)

    if len(os.listdir(path_pickle)) == 0:  # empty: first run
        print("File does not exist yet")
        print('Encoding categorical features')
        cat_features_encoders, labels_class_weights = process_cat_features(
            news_df)
        # write_dict_newsid_encode(cat_features_encoders["article_id"])
        print('Exporting LabelEncoders of categorical features: {}'.format(
            output_label_encoders))
        save_article_cat_encoders(
            output_label_encoders + "acr_label_encoders.pickle",
            cat_features_encoders, labels_class_weights)
        print("Saving news articles CSV to {}".format(
            output_articles_csv_path_preprocessed))
        news_df.to_csv(output_articles_csv_path_preprocessed +
                       "cafebiz_articles.csv", index=False)
        print('Tokenizing articles...')
        tokenized_articles = tokenize_articles(
            news_df['text_highlights'].values,
            tokenization_fn=get_tkn_fn(max_words_length))
        print('Computing word frequencies...')
        words_freq = get_words_freq(tokenized_articles)
        print("Loading word2vec model and extracting words of this corpus' vocabulary...")
        w2v_model = Singleton.getInstance(input_word_embeddings_path)
        word_vocab, word_embeddings_matrix = process_word_embedding_for_corpus_vocab(
            w2v_model, words_freq, vocab_most_freq_words)
        print('Saving word embeddings and vocab.: {}'.format(
            output_word_vocab_embeddings_path))
        save_word_vocab_embeddings(
            output_word_vocab_embeddings_path + "acr_word_vocab_embeddings.pickle",
            word_vocab, word_embeddings_matrix)
        print('Converting tokens to int numbers (according to the vocab.)...')
        texts_int, texts_lengths = convert_tokens_to_int(
            tokenized_articles, word_vocab)
        news_df['text_length'] = texts_lengths
        news_df['text_int'] = texts_int
        data_to_export_df = news_df[[
            'id',
            'url',  # For debug
            'id_encoded',
            'category0_encoded',
            # 'category1_encoded',
            'keywords_encoded',
            # 'author_encoded',
            # 'concepts_encoded',
            # 'entities_encoded',
            'locations_encoded',
            'persons_encoded',
            'created_at_ts',
            'text_length',
            'text_int'
        ]]
        print("Category 0:", news_df["category0_encoded"].unique())
        for k, v in labels_class_weights.items():
            print("Label class weight shape:", k, ":", v.shape)
        print('Exporting tokenized articles to TFRecords: {}'.format(
            path_tf_record))
        export_dataframe_to_tf_records(data_to_export_df,
                                       make_sequence_example,
                                       output_path=output_tf_records_path,
                                       examples_by_file=articles_by_tfrecord)
    else:  # not empty: subsequent runs
        print("File already exists")
        print("Database has new articles")
        print("Call singleton ACR content: ")
        # Load ACR content
        # #1
        # from pick_singleton.pick_singleton import ACR_Pickle_Singleton
        # acr_content = ACR_Pickle_Singleton.getInstance()
        # acr_label_encoders = acr_content.acr_label_encoders
        # #2
        # dict_news_id_encode = load_dict_news_id_encode()
        # word_vocab, word_embeddings_matrix = load_acr_preprocessing_word_embedding(
        #     get_file_max_date(get_all_file(output_word_vocab_embeddings_path)))
        print('Encoding categorical features')
        cat_features_encoders, labels_class_weights = process_cat_features_second_time(
            news_df, acr_label_encoders)
        # append_dict_newsid_encode(cat_features_encoders["article_id"])
        # write_dict_newsid_encode(cat_features_encoders["article_id"])
        # print('Exporting LabelEncoders of categorical features: {}'.format(output_label_encoders))
        # save_article_cat_encoders(output_label_encoders + "acr_label_encoders.pickle",
        #                           cat_features_encoders, labels_class_weights)
        print("Saving news articles CSV to {}".format(
            output_articles_csv_path_preprocessed))
        path_csv = get_all_file(output_articles_csv_path_preprocessed)[0]
        df = pd.read_csv(path_csv)
        frames = [df, news_df]
        result = pd.concat(frames, ignore_index=True)
        result.to_csv(output_articles_csv_path_preprocessed +
                      "cafebiz_articles.csv", index=False)
        print('Tokenizing articles...')
        tokenized_articles = tokenize_articles(
            news_df['text_highlights'].values,
            tokenization_fn=get_tkn_fn(max_words_length))
        print('Computing word frequencies...')
        words_freq = get_words_freq(tokenized_articles)
        print("Loading word2vec model and extracting words of this corpus' vocabulary...")
        w2v_model = Singleton.getInstance(input_word_embeddings_path)
        word_vocab, word_embeddings_matrix = process_word_embedding_for_corpus_vocab(
            w2v_model, words_freq, vocab_most_freq_words)
        print('Saving word embeddings and vocab.: {}'.format(
            output_word_vocab_embeddings_path))
        # save_word_vocab_embeddings(output_word_vocab_embeddings_path + "acr_word_vocab_embeddings.pickle",
        #                            word_vocab, word_embeddings_matrix)
        print('Converting tokens to int numbers (according to the vocab.)...')
        texts_int, texts_lengths = convert_tokens_to_int_second_time(
            tokenized_articles, word_vocab)
        news_df['text_length'] = texts_lengths
        news_df['text_int'] = texts_int
        data_to_export_df = news_df[[
            'id',
            'url',  # For debug
            'id_encoded',
            'category0_encoded',
            # 'category1_encoded',
            'keywords_encoded',
            # 'author_encoded',
            # 'concepts_encoded',
            # 'entities_encoded',
            'locations_encoded',
            'persons_encoded',
            'created_at_ts',
            'text_length',
            'text_int'
        ]]
        print("Category 0:", news_df["category0_encoded"].unique())
        for k, v in labels_class_weights.items():
            print("Label class weight shape:", k, ":", v.shape)
        print('Exporting tokenized articles to TFRecords: {}'.format(
            path_tf_record))
        print("len data_to_export_df : {}".format(len(data_to_export_df)))
        export_dataframe_to_tf_records(data_to_export_df,
                                       make_sequence_example,
                                       output_path=output_tf_records_path,
                                       examples_by_file=articles_by_tfrecord)

    print("<=== END ACR PREPROCESS ===>")
    return isEmpty
def main(): """Main function""" # Get args args = get_args() # Azure credentials sas_token = args.sas storage_account_name = args.storage container_in = args.container_in container_out = args.container_out azure_accounts = list() azure_accounts.append({ "storage": storage_account_name, "sas": sas_token, "container": container_in }) azure_accounts.append({ "storage": storage_account_name, "sas": sas_token, "container": container_out }) # VM cores = args.vm_cores ram = args.vm_ram shuffle_partitions = args.shuffle_partitions # Date, country, prefix country = args.country date_string = args.date # Set date variables day_time = datetime.strptime(date_string, "%Y-%m-%d") year = day_time.year month = day_time.month day = day_time.day dayofyear = day_time.timetuple().tm_yday # config accuracy = args.accuracy # Path in - path out blob_in = f"wasbs://{container_in}@{storage_account_name}.blob.core.windows.net/preprocessed/{country}/" path_out = f"users_bin_hours3-v{VERSION}/{country}" # config spark conf = getSparkConfig(cores, ram, shuffle_partitions, azure_accounts) # set prop for handling partition columns as strings (fixes prefixes as int) conf.set("spark.sql.sources.partitionColumnTypeInference.enabled", "false") # Create spark session sc = SparkContext(conf=conf).getOrCreate() sqlContext = SQLContext(sc) spark = sqlContext.sparkSession # Init azure client blob_service_client = BlobServiceClient.from_connection_string( CONN_STRING.format(storage_account_name, sas_token)) # build keys, date is mandatory partition_key = "year={}/month={}/day={}".format(year, month, day) out_key = "year={}/dayofyear={}".format(year, dayofyear) blob_base = "{}/{}".format(path_out, out_key) # process print("process " + partition_key + " to " + blob_base) start_time = time.time() local_dir = LOCAL_PATH + out_key print("write temp to " + local_dir) # cleanup local if exists if (os.path.isdir(local_dir)): map(os.unlink, (os.path.join(local_dir, f) for f in os.listdir(local_dir))) # TODO cleanup remote if exists # Input dataset print("read dataset table") read_time = time.time() # dfs = spark.read.format("parquet").load(blob_in) # # apply partition filter # dfs_partition = dfs.where( # f"(year = {year} AND month = {month} AND day = {day} AND prefix = '{prefix}')") # read only partition to reduce browse time dfs_cur_partition = spark.read.format("parquet").load( f"{blob_in}/{partition_key}") spark_time = time.time() dfs_cur_partition = dfs_cur_partition.where((col('accuracy') <= accuracy) & (col('accuracy') >= 0)) dfs_cur_partition = dfs_cur_partition.withColumn('hour', F.hour('timestamp')) result_df = dfs_cur_partition.groupBy('userId').agg( F.countDistinct("hour").cast('int').alias('num_hours')) # rebuild prefix result_df = result_df.withColumn('prefix', result_df.userId.substr(1, 2)) # lit partition columns in output result_df = result_df.withColumn('year', F.lit(year)) #result_df = result_df.withColumn('month', F.lit(month)) #result_df = result_df.withColumn('day', F.lit(day)) #result_df = result_df.withColumn('dayofyear', F.lit(dayofyear)) # write as single partition result_df.repartition(1).write.partitionBy("prefix").format( 'parquet').mode("overwrite").save(local_dir + "/") # stats - enable only for debug! 
# num_records = result_df.count() # print(f"written {num_records} rows to "+local_dir) # if num_records == 0: # raise Exception("Zero rows output") print("upload local data to azure") upload_time = time.time() # upload parts over states for fprefix in enumerate_prefixes(): print(f"upload files for {fprefix}") prefix_dir = local_dir + "/prefix=" + fprefix prefix_key = f"{out_key}/prefix={fprefix}/" prefix_blob = f"{blob_base}/prefix={fprefix}" if (os.path.isdir(prefix_dir)): files = [ filename for filename in os.listdir(prefix_dir) if filename.startswith("part-") ] if len(files) > 0: for file_local in files: file_path = prefix_dir + "/" + file_local part_num = int(file_local.split('-')[1]) part_key = '{:05d}'.format(part_num) # fix name as static hash to be reproducible filename_hash = hashlib.sha1( str.encode(prefix_key + part_key)).hexdigest() blob_key = "{}/part-{}-{}.snappy.parquet".format( prefix_blob, part_key, filename_hash) print("upload " + file_path + " to " + container_out + ":" + blob_key) blob_client = blob_service_client.get_blob_client( container_out, blob_key) with open(file_path, "rb") as data: blob_client.upload_blob(data, overwrite=True) # cleanup os.remove(file_path) else: print(f"no files to upload for {fprefix}") else: print(f"missing partition for {fprefix}") print("--- {} seconds elapsed ---".format(int(time.time() - start_time))) print() shutdown_time = time.time() spark.stop() end_time = time.time() print("Done in {} seconds (read:{} spark:{} upload:{} shutdown:{})".format( int(end_time - start_time), int(spark_time - read_time), int(upload_time - spark_time), int(shutdown_time - upload_time), int(end_time - shutdown_time))) print('Done.')
conf.set("spark.executor.memory", "4g") # conf.set("spark.sql.shuffle.partitions", "400") # conf.set("spark.yarn.executor.memoryOverhead", "256m") conf.set("spark.network.timeout", "2000") conf.set("spark.sql.broadcastTimeout", "300000") # conf.set("spark.dynamicAllocation.enabled","true") # conf.set("spark.shuffle.service.enabled", "true") # conf.set("spark.local.dir", "/yelp-dataset/spark-tmp") conf.set("spark.driver.memory", "1g") # conf.set("spark.driver.maxResultSize","10g") # sc = SparkContext("local[*]", "Simple App", conf=conf) # sc.setCheckpointDir('/tmp') os.environ['PYSPARK_PYTHON'] = '/usr/bin/python3.6' conf.setMaster(SPARK_URL) sc = SparkContext(conf=conf) sql_sc = SQLContext(sc) # In[5]: # users.persist() # In[6]: # + cat_features + geo_features # train_df = train_df.select(["user_id", "user_id_2"] + feature_columns + other_columns) # assembler = VectorAssembler(inputCols=feature_columns, outputCol="input_features") # features_df = assembler.transform(train_df) # scaler = StandardScaler(inputCol="input_features", outputCol="scaled_features", # withStd=True, withMean=True)
    rdd_parser = sub_parsers.add_parser('rdd')
    rdd_parser.set_defaults(func=load_to_rdd)
    rdd_parser.add_argument(
        '--num',
        dest='num',
        type=int,
        default=5,
        help='Number of values to print',
    )
    rdd_parser.add_argument('--skip-header',
                            dest='skipheader',
                            choices=['Y', 'N'],
                            default='Y')

    df_parser = sub_parsers.add_parser('df')
    df_parser.set_defaults(func=load_to_df)
    return parser


parser = config_parser()

if __name__ == '__main__':
    args = parser.parse_args()
    with SparkContext(appName='Load Users From CSV') as sc:
        sqc = SQLContext(sc)
        args.func(args, sc, sqc)
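# A hypothetical invocation, assuming the script above is saved as
# load_users.py (the real file name is not shown here). Each subcommand
# dispatches to the function registered via set_defaults(func=...):
#
#   spark-submit load_users.py rdd --num 10 --skip-header N
#   spark-submit load_users.py df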
from pyspark import SparkContext, SQLContext
from pyspark.sql import functions as F

sc = SparkContext("local[*]", "SQL_Example")
sqlc = SQLContext(sc)

df = sqlc.read.load(
    "hdfs://my-hadoop-cluster-hadoop-hdfs-nn:9000/data/dataset.csv",
    format="csv", sep=",", inferSchema="true", header="true")

# total listens per artist, ascending (renamed from `sum` to avoid
# shadowing the built-in)
totals = df.groupBy('artist').agg(
    F.sum('timesListened').alias('timesListened')) \
    .orderBy('timesListened', ascending=True)
totals.show()

totals.write.option("truncate", "true").format("jdbc") \
    .option("url", "jdbc:mysql://my-database-service:3306/spotifydb") \
    .option("dbtable", "live_spotify") \
    .option("user", "root") \
    .option("password", "mysecretpw") \
    .mode("overwrite") \
    .save()
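# Deployment note (assumption, not shown in this snippet): the JDBC write
# above requires the MySQL driver on the Spark classpath, e.g.:
#
#   spark-submit --packages mysql:mysql-connector-java:5.1.46 job.py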
if __name__ == "__main__":
    # execute only if run as a script
    try:
        SUBMIT_ARGS = '''
        --master local[*]
        --driver-memory 2g
        --packages mysql:mysql-connector-java:5.1.46
        pyspark-shell
        '''
        os.environ["PYSPARK_SUBMIT_ARGS"] = SUBMIT_ARGS
        conf = SparkConf()
        sc = SparkContext(conf=conf)
        sql_c = SQLContext(sc)
        sc.setLogLevel("WARN")
        spark = sql_c.sparkSession
        logging.info("Spark Application is up")
    except Exception as e:
        logging.error(str(e))
        raise

    # FIRST AIRFLOW STAGE
    try:
        # In production these files would be moved with the AWS Boto3 library,
        # to enable moving data between S3 buckets
        mkdir_p(os.getcwd() +
                "/the-movies-dataset-internal/movies_metadata/year=" +
                str(datetime.date.today().year) + "/month=" +
                str(datetime.date.today().month))
from pyspark import SparkContext, SparkConf, SQLContext
import sys

if __name__ == '__main__':
    conf = SparkConf().setAppName("Natalia Martir - Practica 4")
    Sp = SparkContext(conf=conf)

    headers = Sp.textFile(
        "/user/datasets/ecbdl14/ECBDL14_IR2.header").collect()
    headers = list(filter(lambda x: "@inputs" in x, headers))[0]
    headers = headers.replace(",", "").strip().split()
    del headers[0]
    headers.append("class")

    sqlc = SQLContext(Sp)
    df = sqlc.read.csv('/user/datasets/ecbdl14/ECBDL14_IR2.data',
                       header=False,
                       inferSchema=True)
    for i, colname in enumerate(df.columns):
        df = df.withColumnRenamed(colname, headers[i])

    df = df.select("PSSM_r1_0_A", "PSSM_r2_-1_S", "PSSM_central_-1_G",
                   "PSSM_r2_-1_Q", "PSSM_r1_3_E", "PSSM_r1_-1_E", "class")
    df.write.csv('./filteredC.small.training', header=True)
import os

from gensim.models import Word2Vec

# configure spark to be started with more allocated memory
memory = '12g'
pyspark_submit_args = ' --driver-memory ' + memory + ' pyspark-shell'
os.environ["PYSPARK_SUBMIT_ARGS"] = pyspark_submit_args

from pyspark import SparkConf, SparkContext, SQLContext

conf = (SparkConf().setMaster("local").setAppName("KarmaDSL").set(
    "spark.executor.cores", "8").set("spark.executor.memory", "1g"))
sc = SparkContext(conf=conf)
sql_context = SQLContext(sc)

root_dir = os.path.abspath(os.path.join(os.path.realpath(__file__), '..'))
data_dir = os.path.join(root_dir, "data/datasets")
train_model_dir = os.path.join(root_dir, "data/train_models")
# word2vec = Word2Vec.load_word2vec_format(
#     os.path.join("/Users/minhpham/tools/", 'GoogleNews-vectors-negative300.bin'),
#     binary=True)
file_write = open('debug.txt', 'w')

logger = sc._jvm.org.apache.log4j
logger.LogManager.getLogger("org").setLevel(logger.Level.FATAL)
logger.LogManager.getLogger("akka").setLevel(logger.Level.FATAL)
# Enable inline plotting
# %matplotlib inline / not supported in Databricks; use display() instead

import os  # in case you have to remove a file: os.remove(<filename>)
import sys
import matplotlib
from time import gmtime, strftime

from pyspark import SparkConf
from pyspark.sql import SparkSession, SQLContext

showtime = strftime("%Y%m%d-%H%M%S", gmtime())
appName = "Chart Lab Diagnoses: " + showtime

if True:  # sc automatically set in Databricks, set here for CDSW
    conf = SparkConf().setMaster("spark://ken-HP-EliteBook-8530w:7077")
    spark = SparkSession.builder.appName(appName).config(
        conf=conf).getOrCreate()
    sc = spark.sparkContext
    sqlctx = SQLContext(sc)

if False:  # snappy is the default compression
    sqlctx.setConf("spark.sql.parquet.compression.codec", "uncompressed")

print(sc.master)
print(spark.sparkContext.getConf)
print('Python version ' + sys.version)
# print('Pandas version ' + pd.__version__)
print('Matplotlib version ' + matplotlib.__version__)

# COMMAND ----------
def load_data(path):
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)  # relies on a global SparkContext `sc`
    data = sqlContext.read.parquet(path)
    return data
def load_model():
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)  # relies on a global SparkContext `sc`
    lookup = sqlContext.read.parquet(
        '/user/rmusters/2015model99/data').alias("lookup")
    # collect the lookup table as a dict on the driver, then broadcast it
    # to the executors
    lookup_bd = sc.broadcast(lookup.rdd.collectAsMap())
    return lookup_bd
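# A minimal usage sketch (an assumption, not part of the original code):
# collectAsMap() only yields a plain dict if each parquet row is a
# (key, value) pair, and executors read the broadcast through .value.
lookup_bd = load_model()
words = sc.parallelize(["spark", "hadoop"])
pairs = words.map(lambda w: (w, lookup_bd.value.get(w)))  # None if missing
print(pairs.collect())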
def save_dist(df_path, fname):
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)  # relies on a global SparkContext `sc`
    df = sqlContext.read.parquet(df_path)
    # legacy Spark 1.x API: DataFrame.save(path, source, mode)
    df.save(fname, "com.databricks.spark.csv", "overwrite")
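# Sketch of the same function against the Spark 2+ writer API, where the
# legacy DataFrame.save(path, source, mode) call and the external spark-csv
# package are superseded by the built-in CSV writer.
def save_dist_v2(df_path, fname):
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)  # same global `sc` assumption as above
    df = sqlContext.read.parquet(df_path)
    df.write.format("csv").mode("overwrite").save(fname)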
def signalDbDataFrameFromSignalDbRDD(sparkContext, rdd):
    sqlContext = SQLContext(sparkContext)
    return sqlContext.createDataFrame(rdd, StructType(_structFieldArray()))
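# _structFieldArray() is defined elsewhere and not shown here; the sketch
# below only illustrates the expected return type. The field names and
# types are hypothetical, not the real signal-DB schema.
from pyspark.sql.types import StructField, StringType, LongType, DoubleType

def _struct_field_array_sketch():
    return [
        StructField("signalId", StringType(), False),   # hypothetical field
        StructField("timestamp", LongType(), False),    # hypothetical field
        StructField("value", DoubleType(), True),       # hypothetical field
    ]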
def main(self, sc, *args):
    sql_context = SQLContext(sc)
    df = self.read_from_hdfs(sql_context)
    self.write_to_db(df)
# import findspark
# findspark.init()
from pyspark import SparkContext
from pyspark import SQLContext
from pyspark import SparkConf
# import os
# os.environ['PYSPARK_SUBMIT_ARGS'] = "--master local[2] pyspark-shell"
# os.environ['JAVA_HOME'] = "$(/usr/libexec/java_home -v 13)"

####################
# Run this Python file with spark-submit:
####################
# spark-submit i0spark-submit-demo.py
sc = SparkContext("local", "PySparkShell")
spark = SQLContext(sc)

lines = sc.textFile("i1introduction.md")
c = lines.count()
f = lines.first()
print('#' * 20)
print(c, f)
from pyspark import SparkContext, SQLContext
import matplotlib.pyplot as plt
import pyspark.sql.functions as func

plt.rcParams["figure.figsize"] = (20, 10)
import pandas as pd

try:
    sc = SparkContext("local", "Simple App")
except ValueError:
    pass

# In[2]:

sql_ctx = SQLContext(sc)

# Reading the data

# In[3]:

train_data = sql_ctx.read.csv(
    '/home/ravibisla/PycharmProjects/DataScience/train_rating.txt',
    header=True)
train_data.registerTempTable('train_data')
train_data.cache()
test_data = sql_ctx.read.csv(
    '/home/ravibisla/PycharmProjects/DataScience/test_rating.txt',
    header=True)
test_data.registerTempTable('test_data')
test_data.cache()
# train_review = sql_ctx.read.json('/home/ravibisla/PycharmProjects/DataScience/train_review.json')
# train_review.registerTempTable('train_review')
# coding: utf-8

# ## Pyspark test

# In[4]:

from pyspark.sql.types import *
from pyspark import SQLContext, HiveContext, SparkContext

# In[9]:

sc = SparkContext()

# In[10]:

sqlcontext = SQLContext(sc)

# In[11]:

hive_context = HiveContext(sc)

# In[12]:

rdd = sc.parallelize(range(1000))

# In[13]:

rdd.takeSample(False, 5)

# In[1]:
from pyspark import SparkContext, SQLContext

sc = SparkContext(appName='emrtest')
sqlCtx = SQLContext(sc)
print(sc)

rdd = sc.parallelize(['1', '2', '3'])
print('the count in rdd is ', rdd.count())
rdd.repartition(1).saveAsTextFile(
    'hdfs://ec2-52-87-197-156.compute-1.amazonaws.com/tmp/count')
def main(): """Main function""" # Get args args = get_args() # container container_in = args.container_in container_out = args.container_out # Azure credentials sas_token = args.sas storage_account_name = args.storage azure_accounts = list() azure_accounts.append({ "storage": storage_account_name, "sas": sas_token, "container": container_in }) azure_accounts.append({ "storage": storage_account_name, "sas": sas_token, "container": container_out }) oauth_login = args.oauth_login oauth_client_id = args.oauth_client_id oauth_client_secret = args.oauth_client_secret # requires hadoop 3.2+ # azure_oauth = { # "endpoint": oauth_login, # "client-id": oauth_client_id, # "client-secret": oauth_client_secret # } azure_oauth = False # VM cores = args.vm_cores ram = args.vm_ram shuffle_partitions = args.shuffle_partitions # Date, prefix country = args.country prefix = args.prefix # process config roam_dist_stops = args.roam_dist_stops roam_dist_events = args.roam_dist_events # Path in - path out blob_in = f"wasbs://{container_in}@{storage_account_name}.blob.core.windows.net/stoplocation-v8_prefix_r70-s5-a70-h6/{country}/" timezones_in = f"wasbs://cuebiq-data@{storage_account_name}.blob.core.windows.net/utils_states_timezones/" if azure_oauth: # we can leverage abfss blob_in = f"abfss://{container_in}@{storage_account_name}.dfs.core.windows.net/stoplocation-v8_prefix_r70-s5-a70-h6/country={country}/" timezones_in = f"abfss://cuebiq-data@{storage_account_name}.dfs.core.windows.net/utils_states_timezones/" path_out_distinct = f"distinct_user_clusters-v8_r70-s5-a70-h6_clustered_{roam_dist_stops}m_v{VERSION}/country={country}" path_out_all = f"all_user_clusters-v8_r70-s5-a70-h6_clustered_{roam_dist_stops}m_v{VERSION}/country={country}" # config spark conf = getSparkConfig(cores, ram, shuffle_partitions, azure_accounts, azure_oauth) # set prop for handling partition columns as strings (fixes prefixes as int) conf.set("spark.sql.sources.partitionColumnTypeInference.enabled", "false") # Create spark session sc = SparkContext(conf=conf).getOrCreate() sqlContext = SQLContext(sc) spark = sqlContext.sparkSession # register UDF from jar spark.udf.registerJavaFunction( "geohash", "it.smartcommunitylab.sco.mobilitycovid.udf.GeohashEncode") # Init azure client blob_service_client = BlobServiceClient.from_connection_string( CONN_STRING.format(storage_account_name, sas_token)) # build keys, date is mandatory, prefix opt partition_key = f"prefix={prefix}" print("process " + partition_key) start_time = time.time() local_dir = LOCAL_PATH + partition_key print("write temp to " + local_dir) # cleanup local if exists if (os.path.isdir(local_dir)): map(os.unlink, (os.path.join(local_dir, f) for f in os.listdir(local_dir))) # Input dataset print("read dataset table") read_time = time.time() # explode days manually dates = [datetime(2020, 1, 1) + timedelta(days=x) for x in range(0, 258)] blobs_in = [ "{}/year={}/month={}/day={}/prefix={}".format(blob_in, d.year, d.month, d.day, prefix) for d in dates ] #dfs = spark.read.format("parquet").load(*blobs_in) dfs = read_multiple_df(spark, blobs_in) dfs_timezones = spark.read.format("parquet").load(timezones_in) # manually inject prefix column dfs = dfs.withColumn("prefix", F.lit(prefix)) # apply partition filter dfs_state = dfs.where(f"prefix = '{prefix}'") print("processing with spark") spark_time = time.time() w = Window().partitionBy('userId').orderBy('begin') dfs_state = add_distance_column(dfs_state, order_column='begin') dfs_state = dfs_state.fillna(0, 
subset=['next_travelled_distance']) dfs_state = dfs_state.withColumn( 'lag_next_travelled_distance', F.lag(col('next_travelled_distance')).over(w)) dfs_state = dfs_state.withColumn('lag_end', F.lag('end').over(w)) dfs_state = dfs_state.withColumn( 'rn', F.when( ((col('lag_next_travelled_distance') != col('prev_travelled_distance')) | (col('prev_travelled_distance') > 0) | (col('lag_next_travelled_distance') > 0) | (col('distance_prev') > roam_dist_events) | ((F.dayofyear(col('begin')) - F.dayofyear(col('lag_end')) == 1) & (F.hour(col('begin')) < 6))) & ((col('lag_end').isNull()) | (col('lag_end') < col('begin'))), 1).otherwise(0)) # Remove prev_travelled distance when rn == 0 (it happens when lag_end and begin overlap) dfs_state = dfs_state.withColumn( 'prev_travelled_distance', F.when(col('rn') == 0, 0).otherwise(col('prev_travelled_distance'))) w = Window().partitionBy('userId').orderBy('begin').rangeBetween( Window.unboundedPreceding, 0) dfs_state = dfs_state.withColumn('group', F.sum('rn').over(w)) dfs_state = dfs_state.groupBy('userId', 'group', 'state').agg( F.mean('latitude').alias('latitude'), F.mean('longitude').alias('longitude'), F.min('begin').alias('begin'), F.max('end').alias('end')).drop('group') dfs_destinations = get_destinations(dfs_state, roam_dist=roam_dist_stops) dfs_destinations = dfs_destinations.withColumn( 'prefix', dfs_destinations.userId.substr(1, 2)) dfs_destinations = dfs_destinations.withColumn('dayofyear', F.dayofyear('begin')) dfs_destinations = dfs_destinations.withColumn('year', F.year('begin')) # dfs_destinations = dfs_destinations.withColumn('state', F.lit(state)) # Local time dfs_destinations.createOrReplaceTempView("dfs_destinations") dfs_destinations = spark.sql(""" SELECT dfs_destinations.*, geohash(clusterLatitude, clusterLongitude, 7) as geohash7 from dfs_destinations """) dfs_destinations = dfs_destinations.withColumn( 'geohash5', F.substring(col('geohash7'), 1, 5)) dfs_destinations = dfs_destinations.join(F.broadcast(dfs_timezones), on='geohash5').drop('geohash5') dfs_destinations = dfs_destinations.withColumn( 'local_begin', F.from_utc_timestamp(col('begin'), col('tzid'))) dfs_destinations = dfs_destinations.withColumn( 'offset', ((col('local_begin').cast('long') - col('begin').cast('long')) / 3600).cast('int')).drop('local_begin') dfs_destinations.persist(StorageLevel.DISK_ONLY) # Write # output as country/prefix/part1..N local_dir_all = local_dir + "/all/" dfs_destinations_all = dfs_destinations.select('prefix', 'userId', 'clusterId', 'begin', 'end', 'offset', 'year', 'dayofyear') dfs_destinations_all.repartition(8, 'dayofyear').write.format( 'parquet').mode('overwrite').save(local_dir_all + "prefix=" + prefix + "/") # output as country/prefix/state local_dir_distinct = local_dir + "/distinct/" dfs_destinations_distinct = dfs_destinations.select( 'prefix', 'userId', 'clusterId', 'clusterLatitude', 'clusterLongitude', 'geohash7', 'state').distinct() dfs_destinations_distinct.repartition("state").write.partitionBy( "state").format('parquet').mode('overwrite').save(local_dir_distinct + "prefix=" + prefix + "/") dfs_destinations.unpersist() print("upload local data to azure") upload_time = time.time() # upload parts 1 "prefix/state" print(f"upload files for distinct") # upload with threads dfutures = [] with ThreadPoolExecutor(max_workers=THREADS) as executor: fprefix = prefix print(f"upload files for distinct: {fprefix}") prefix_dir = local_dir_distinct + "prefix=" + fprefix prefix_key = f"prefix={fprefix}" for state in US_STATES: s_key = 
f"state={state}" f_dir = prefix_dir + "/" + s_key f_key = prefix_key + "/" + s_key # print(f"read files for distinct from {f_dir}") if (os.path.isdir(f_dir)): files = [ filename for filename in os.listdir(f_dir) if filename.startswith("part-") ] if len(files) > 0: for file_local in files: file_path = f_dir + "/" + file_local part_num = int(file_local.split('-')[1]) part_key = '{:05d}'.format(part_num) # fix name as static hash to be reproducible filename_hash = hashlib.sha1( str.encode(f_key + f_key + part_key)).hexdigest() blob_key = "{}/{}/part-{}-{}.snappy.parquet".format( path_out_distinct, f_key, part_key, filename_hash) # print("upload " + file_path + " to " + container_out+":"+blob_key) # upload_blob(blob_service_client,container_out, blob_key, file_path) future = executor.submit(upload_blob, blob_service_client, container_out, blob_key, file_path) dfutures.append(future) # else: # print(f"no files to upload for {f_key}") # else: # print(f"missing partition for {f_key}") # end of loop, wait for futures for future in dfutures: bkey = future.result() # ensure we wait all tasks # TODO check if all done ddone = concurrent.futures.wait(dfutures) # upload parts 2 "prefix/parts" print(f"upload files for all") fprefix = prefix # upload with threads afutures = [] with ThreadPoolExecutor(max_workers=THREADS) as executor: print(f"upload files for all: {fprefix}") prefix_dir = local_dir_all + "prefix=" + fprefix prefix_key = f"prefix={fprefix}" if (os.path.isdir(prefix_dir)): files = [ filename for filename in os.listdir(prefix_dir) if filename.startswith("part-") ] if len(files) > 0: for file_local in files: file_path = prefix_dir + "/" + file_local part_num = int(file_local.split('-')[1]) part_key = '{:05d}'.format(part_num) # fix name as static hash to be reproducible filename_hash = hashlib.sha1( str.encode(prefix_key + part_key)).hexdigest() blob_key = "{}/{}/part-{}-{}.snappy.parquet".format( path_out_all, prefix_key, part_key, filename_hash) # print("upload " + file_path + " to " + container_out+":"+blob_key) # upload_blob(blob_service_client,container_out, blob_key, file_path) future = executor.submit(upload_blob, blob_service_client, container_out, blob_key, file_path) afutures.append(future) # else: # print(f"no files to upload for {d_key}") # else: # print(f"missing partition for {d_key}") # end of loop, wait for futures for future in afutures: bkey = future.result() # ensure we wait all tasks # TODO check if all done adone = concurrent.futures.wait(afutures) print("--- {} seconds elapsed ---".format(int(time.time() - start_time))) print() shutdown_time = time.time() spark.stop() end_time = time.time() print("Done in {} seconds (read:{} spark:{} upload:{} shutdown:{})".format( int(end_time - start_time), int(spark_time - read_time), int(upload_time - spark_time), int(shutdown_time - upload_time), int(end_time - shutdown_time))) print('Done.')
from pyspark import SparkConf, SparkContext, SQLContext

conf = SparkConf().setMaster('local').setAppName('ALS')
sc = SparkContext(conf=conf)
sq = SQLContext(sc)

path = r'hdfs://localhost:9000//input/python_spark_hadoop/u.data'
rawUserData = sc.textFile(path)
rawRatings = rawUserData.map(lambda line: line.split('\t')[:3])
ratingsRDD = rawRatings.map(lambda x: (x[0], x[1], x[2]))
ratingsRDD.take(5)

# number of ratings
numRatings = ratingsRDD.count()
# number of users
numUsers = ratingsRDD.map(lambda x: x[0]).distinct().count()
# number of movies
numMovies = ratingsRDD.map(lambda x: x[1]).distinct().count()

from pyspark.mllib.recommendation import ALS

model = ALS.train(ratingsRDD, 10, 10, 0.01)  # rank=10, iterations=10, lambda=0.01
print(model)
model.recommendProducts(100, 5)
model.predict(100, 1643)
model.recommendUsers(product=200, num=5)

try:
    model.save(sc, r'file:/home/xiligey/Study/Spark/PythonSparkHadoop/ALSmodel')
except Exception as e:
    print(e)
    print('Model already exists; delete it first or choose another directory')

from pyspark.mllib.recommendation import MatrixFactorizationModel

try:
from pyspark import SQLContext, SparkContext
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark.sql import functions as func
import sys

sc = SparkContext()
sql = SQLContext(sc)
spark = SparkSession(sc)

# read product file
products = spark.read.csv('../data/products.csv', header=True)
print(products.show(3))

window_spec1 = Window.partitionBy(products['category'])\
    .orderBy(products['price'].desc())

price_rank = func.rank().over(window_spec1)

product_rank = products.select(
    products['product'],
    products['category'],
    products['price'],
).withColumn('rank', price_rank)
print(product_rank.show())

# rows between -1 and 1
window_spec2 = Window.partitionBy(products['category'])\
    .orderBy(products['price'].desc())\
    .rowsBetween(-1, 1)
from pyspark import SparkConf, SparkContext, SQLContext, Row
from pyspark.sql import functions as F

APP_NAME = "count unique visitors per day"


def main(sc, sqlC):
    # mock data
    userAccessLog = [
        "2017-01-01,1122", "2017-01-01,1122", "2017-01-01,1123",
        "2017-01-01,1124", "2017-01-01,1124", "2017-01-02,1122",
        "2017-01-01,1121", "2017-01-01,1123", "2017-01-01,1123"
    ]
    accessLogRDD = sc.parallelize(userAccessLog)
    RowRDD = accessLogRDD.map(
        lambda e: Row(e.split(",")[0], int(e.split(",")[1])))
    df = sqlC.createDataFrame(RowRDD, ['date', 'userid'])
    df.show()
    df.printSchema()
    df.groupBy('date').agg(F.countDistinct(df.userid)).show()


if __name__ == "__main__":
    conf = SparkConf().setAppName(APP_NAME).setMaster("local[*]")
    sc = SparkContext(conf=conf)
    sqlC = SQLContext(sc)
    main(sc, sqlC)
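# Sanity check, derived by hand from the mock log above: 2017-01-01 has
# four distinct users (1121, 1122, 1123, 1124) and 2017-01-02 has one.
# The final show() should therefore print roughly (row order may vary):
#
# +----------+----------------------+
# |      date|count(DISTINCT userid)|
# +----------+----------------------+
# |2017-01-01|                     4|
# |2017-01-02|                     1|
# +----------+----------------------+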
sys.stderr.write("Example-1: " + exeName +
                 " 10 hdfs:///perfdata/freebasedeletions/* \n")
sys.stderr.write("Example-2: " + exeName +
                 " 1 hdfs:///perf/data/deletions/deletions.csv-00000-of-00020\n")
exit(-1)

print(str(datetime.now()) + " runCount = " + str(sys.argv[1]) + ", data = " +
      str(sys.argv[2]))

from pyspark import SparkContext, SQLContext

beginTime = time.time()
sparkContext = SparkContext()
sqlContext = SQLContext(sparkContext)
PerfBenchmark.RunPerfSuite(FreebaseDeletionsBenchmark, sys.argv, sparkContext,
                           sqlContext)
sparkContext.stop()
PerfBenchmark.ReportResult()
print(str(datetime.now()) + " " + os.path.basename(__file__) +
      " : Finished python version benchmark test. Whole time = " +
      ("%.3f" % (time.time() - beginTime)) + " s.")
def get_sql_context_instance(spark_context):
    if 'sqlContextSingletonInstance' not in globals():
        globals()['sqlContextSingletonInstance'] = SQLContext(spark_context)
    return globals()['sqlContextSingletonInstance']
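# A minimal usage sketch of the lazy singleton above, in the Spark Streaming
# style where a SQLContext must not be recreated per micro-batch. The `words`
# DStream and its setup are assumed, not part of the original snippet.
def process(time, rdd):
    if rdd.isEmpty():
        return
    # reuse one SQLContext per driver process
    sql_context = get_sql_context_instance(rdd.context)
    df = sql_context.createDataFrame(rdd.map(lambda w: (w,)), ['word'])
    df.groupBy('word').count().show()

words.foreachRDD(process)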
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_utc_timestamp
from ts.flint import FlintContext, summarizers

conf = SparkConf().setMaster("local").setAppName("TimeSeries")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
flintContext = FlintContext(sqlContext)
spark = SparkSession(sc)

df = spark.createDataFrame(
    [('2018-08-20', 1.0), ('2018-08-21', 2.0), ('2018-08-24', 3.0)],
    ['time', 'v']).withColumn('time', from_utc_timestamp(col('time'), 'UTC'))

# Convert to Flint DataFrame
flint_df = flintContext.read.dataframe(df)

# Use Spark DataFrame functionality
flint_df = flint_df.withColumn('v', flint_df['v'] + 1)

# Use Flint functionality
flint_df = flint_df.summarizeCycles(summarizers.count())
def main():
    conf = SparkConf().setAppName('housingprice')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)

    taxreportSchema = StructType([
        StructField('PID', StringType(), False),
        StructField('Legal_Type', StringType(), False),
        StructField('FOLIO', StringType(), False),
        StructField('Coordinates', StringType(), True),
        StructField('ZoneName', StringType(), True),
        StructField('ZoneCat', StringType(), True),
        StructField('LOT', StringType(), True),
        StructField('Block', StringType(), True),
        StructField('plan', StringType(), True),
        StructField('DisLot', StringType(), True),
        StructField('FCiviNum', StringType(), True),
        StructField('TCiviNum', StringType(), True),
        StructField('StreetName', StringType(), True),
        StructField('PostalCode', StringType(), True),
        StructField('NLegalName1', StringType(), True),
        StructField('NLegalName2', StringType(), True),
        StructField('NLegalName3', StringType(), True),
        StructField('NLegalName4', StringType(), True),
        StructField('NLegalName5', StringType(), True),
        StructField('CurVal', StringType(), True),
        StructField('CurImpVal', StringType(), True),
        StructField('Taxassess', StringType(), True),
        StructField('prevVal', StringType(), True),
        StructField('prevImpVal', StringType(), True),
        StructField('YearBuilt', StringType(), True),
        StructField('BigImpYear', StringType(), True),
        StructField('Tax_levy', StringType(), True),
        StructField('NeighbourhoodCode', StringType(), True),
    ])
    conversionSchema = StructType([
        StructField('date', StringType(), False),
        StructField('USD', StringType(), False),
        StructField('rate', StringType(), False),
        StructField('reciprate', StringType(), False),
    ])
    crudeoilSchema = StructType([
        StructField('date', DateType(), False),
        StructField('oilprice', StringType(), False),
    ])

    def fixdate(convVal):
        a = convVal.split(" ")
        dates = a[0].split("/")
        alldate = "20" + dates[2] + '/' + dates[0]
        return (alldate, a[1])

    def filterYear(dates):
        a = dates.split('/')
        return a[1] != '2016'

    def processDate(df):
        def splitMonth(cols):
            a = cols.split('/')
            return a[1]

        def splitYear(cols):
            a = cols.split('/')
            return a[0]

        fUDF = udf(splitMonth, StringType())
        df1 = df.withColumn("month", fUDF('year'))
        fUDFyear = udf(splitYear, StringType())
        return df1.withColumn("year", fUDFyear('year'))

    # Reading the Tax Report dataset
    taxreportinfo = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true').schema(taxreportSchema).load(inputs + "taxreport/test")
    taxreportinfo.registerTempTable("taxreport")

    # Selecting the price, tax assessment year and postal code of each property
    propertyVal = sqlContext.sql(
        "SELECT CurVal, Taxassess, PostalCode FROM taxreport")
    propertyVal.registerTempTable("propertyVal")

    # Reading the CAD-to-USD conversion dataset
    conversion = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true').schema(conversionSchema).load(inputs + "conversion")
    conversion.registerTempTable("Conversion")

    # Selecting only the date and rate
    conversionrate = sqlContext.sql(
        "SELECT date, rate FROM Conversion WHERE rate regexp '^[0-9]+'")
    conversionRDD = conversionrate.repartition(40).rdd.map(
        lambda w: (w.date + " " + w.rate))
    # tuple parameter unpacking in lambdas is Python 2 only; index instead
    conversiondates = conversionRDD.map(fixdate).filter(
        lambda wx: filterYear(wx[0])).map(lambda l: Row(date=l[0], rate=l[1]))
    schemaConv = sqlContext.createDataFrame(conversiondates)
    schemaConv.registerTempTable("ConversionDate")
    ConverDF = sqlContext.sql(
        "SELECT date, CAST(AVG(rate) AS DECIMAL(4,2)) as conversionrate "
        "FROM ConversionDate WHERE rate IS NOT NULL GROUP BY date")
    ConverDF.cache()

    # Reading the Canada crude oil price dataset
    crudeoil = sc.textFile(inputs + "crudeoil")
    crudeoilRDD = crudeoil.map(lambda l: l.split()).map(
        lambda l: Row(date=l[0], oilprice=l[1]))
    crudeoilDF = sqlContext.createDataFrame(crudeoilRDD)
    crudeoilDF.registerTempTable("crudeoil")

    # Selecting the date in Y/M format and the oil price
    oilprice = sqlContext.sql(
        "SELECT DATE_FORMAT(date,'Y/M') as date, oilprice FROM crudeoil")
    oilprice.registerTempTable('oilprice')

    # Reading the BC interest rate dataset
    interestRate = sqlContext.read.format('com.databricks.spark.csv').options(
        header='true').load(inputs + "interestrate")
    interestRate.registerTempTable("interest")

    # Selecting the date and the 5-year fixed mortgage rate from the dataset
    interestDF = sqlContext.sql(
        "SELECT DATE_FORMAT(date,'Y/M') as date, "
        "CAST(`5y-fixed-posted` AS DECIMAL(4,2)) AS interestrate "
        "FROM interest WHERE date >= '2006-01' AND date <= '2015-12'")
    interestDF.registerTempTable("allrates")

    # Averaging each month over the days whose value is not null
    avgInterest = sqlContext.sql(
        "SELECT date, AVG(interestrate) as interestrates FROM allrates "
        "WHERE interestrate IS NOT NULL GROUP BY date")
    avgInterest.cache()

    joinedTable = avgInterest.join(
        oilprice, avgInterest['date'] == oilprice['date']).select(
            avgInterest['date'], avgInterest['interestrates'],
            oilprice['oilprice'])
    JoinedConversion = joinedTable.join(
        ConverDF, joinedTable['date'] == ConverDF['date']).select(
            joinedTable['date'].alias('year'), joinedTable['interestrates'],
            joinedTable['oilprice'], ConverDF['conversionrate'])
    JoinedConversion.registerTempTable("joinedConversion")

    ls = processDate(JoinedConversion)
    ls.show()