def join_clusters_with_lda_data():
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SQLContext, Row
    from pyspark.sql.functions import monotonicallyIncreasingId

    conf = (SparkConf()
            .set("spark.driver.maxResultSize", "20g")
            .set("spark.rpc.askTimeout", "120000")
            .set("spark.executor.heartbeatInterval", "120000"))
    sc = SparkContext(appName='join_clusters_with_lda_data', conf=conf)
    sqlContext = SQLContext(sc)

    data_path = "/user/rmusters/lda_doc_topic"
    cluster_path = "/user/rmusters/lda_clusters"

    data = sqlContext.read.parquet(data_path)
    data.show()
    data = data.withColumn("id", monotonicallyIncreasingId())

    cluster = sqlContext.read.parquet(cluster_path)  # .sample(False, 0.00001)#.map(lambda x: Row(cluster=x[0]))
    cluster.show()
    cluster = cluster.withColumn("id", monotonicallyIncreasingId())

    result = data.join(cluster, on="id")
    result = result.drop("id")
    result = result.withColumnRenamed("_1", "id").withColumnRenamed("_2", "vectors")
    result.write.parquet("hdfs:///user/rmusters/bisecting_lda_data_cluster", mode="overwrite")
    result.show()
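# The join above relies on monotonicallyIncreasingId() producing matching ids for both
# DataFrames. The function only guarantees unique, increasing ids within a single
# DataFrame (the values encode partition id and row position), so two independently
# partitioned DataFrames are not guaranteed to line up and the join can silently
# mis-pair rows. A minimal sketch of a positional join via zipWithIndex, assuming both
# inputs have the same row order and row count (join_by_position is an illustrative
# helper, not part of the original code):
def join_by_position(sqlContext, left_df, right_df):
    left = left_df.rdd.zipWithIndex().map(lambda x: (x[1], x[0]))
    right = right_df.rdd.zipWithIndex().map(lambda x: (x[1], x[0]))
    joined = left.join(right).map(lambda x: tuple(x[1][0]) + tuple(x[1][1]))
    return sqlContext.createDataFrame(joined, left_df.columns + right_df.columns)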
def write_data(path):
    import filter
    from pyspark.mllib.feature import Word2Vec, Word2VecModel

    # load data
    loc = '/user/rmusters/text/2015/01/*'
    text_file = sc.textFile(loc)
    data = text_file.map(lambda line: filter.filter(line).split(" "))

    # load model
    word2vec = Word2Vec()
    model = Word2VecModel.load(sc, '/user/rmusters/2015model99')

    # get a tweet vector pair: broadcast the word -> vector lookup table
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)
    lookup = sqlContext.read.parquet('/user/rmusters/2015model99/data').alias("lookup")
    lookup_bd = sc.broadcast(lookup.rdd.collectAsMap())

    vectors = data.map(lambda ws: [lookup_bd.value.get(w) for w in ws])
    logger.info(vectors.count())

    data = text_file.map(lambda line: (line, filter.filter(line).split(" "))) \
        .map(lambda tf: (tf[0], tf[1], [lookup_bd.value.get(w) for w in tf[1]][0]))

    from pyspark.sql.functions import monotonicallyIncreasingId
    df = data.toDF(["text", "filtered_text", "vectors"])
    # this returns a new DataFrame with all the columns plus an id
    res = df.withColumn("id", monotonicallyIncreasingId())
    res.write.parquet(path, mode="overwrite")
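# Note that the "vectors" column above keeps only the vector of the first filtered
# word ([...][0]). If a single vector per tweet is wanted, a common alternative
# (not what the original code does) is to average the word vectors; a minimal sketch,
# assuming the broadcast lookup_bd value from write_data is passed in as lookup:
def tweet_vector(words, lookup):
    vecs = [lookup.get(w) for w in words if lookup.get(w) is not None]
    if not vecs:
        return None
    size = len(vecs[0])
    return [sum(v[i] for v in vecs) / float(len(vecs)) for i in range(size)]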
def generate_pk(p_col_pk, p_df):
    """
    Args:
        p_col_pk - Primary Key column name
        p_df - DataFrame for which to create the PK column
    Returns:
        Spark DataFrame with new PK column
    """
    dfPK = p_df.withColumn(p_col_pk, monotonicallyIncreasingId() + 1)
    return dfPK
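# Minimal usage sketch for generate_pk; the sample DataFrame and column names are
# illustrative, and it assumes monotonicallyIncreasingId is imported from
# pyspark.sql.functions and a sqlContext exists, as in the other snippets.
people = sqlContext.createDataFrame([("alice", 30), ("bob", 25)], ["name", "age"])
people_pk = generate_pk("person_id", people)
people_pk.show()  # person_id values are unique and increasing, but not consecutive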
def canonical_format(df, Item):
    # 1. map to name, id, and manufacturer dictionary
    # 2. map to row number, line tuples
    # 3. collect to a map to provide easy lookup for training algorithm
    def to_dict(row):
        item = Item()
        item.populate(row)
        return item.to_dict()

    df = df.withColumn('id', monotonicallyIncreasingId())
    return (df, df.map(lambda row: (row.id, to_dict(row))).collectAsMap())
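# canonical_format expects an Item class exposing populate(row) and to_dict(). A
# minimal hypothetical Item, only to illustrate that interface (the field names
# follow the name/id/manufacturer comment above but are otherwise made up):
class Item(object):
    def populate(self, row):
        self.id = row.id
        self.name = row.name
        self.manufacturer = row.manufacturer

    def to_dict(self):
        return {"id": self.id, "name": self.name, "manufacturer": self.manufacturer}

# df_with_id, lookup = canonical_format(items_df, Item)  # lookup maps row id -> item dict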
def data_jan():
    from pyspark.sql.functions import monotonicallyIncreasingId
    loc = '/user/rmusters/text/2015/01/*'
    text_file = sc.textFile(loc)
    data = text_file.map(lambda text: (text, filter.filter(text)))
    data = data.toDF(["text", "filtered_text"]).withColumn("id", monotonicallyIncreasingId())
    path = '/user/rmusters/data_jan'
    data.write.parquet(path, mode="overwrite")
    lit(None).cast(StringType()))

# read in readmission data
measure_readmission = hive_context.table("default.measure_readmission")
measure_readmission.registerTempTable("readmission_temp")
measure_readmission_selected = hive_context.sql(
    " SELECT provider_id, measure_id, denominator, score, measure_start, measure_end FROM readmission_temp"
)

# prepare readmissions for a union with effective measures: add empty column for sample
measure_readmission_with_sample = measure_readmission_selected.withColumn(
    'sample', lit(None).cast(StringType()))

# add a column to flag the data as readmission
measure_readmission_with_sample_care_type = measure_readmission_with_sample.withColumn(
    'care_type', lit("readmission").cast(StringType()))

readmission_ordered = measure_readmission_with_sample_care_type.select(
    "provider_id", "measure_id", "score", "measure_start", "measure_end",
    "sample", "denominator", "care_type")

effective_ordered = measure_effective_with_care_type_denominator.select(
    "provider_id", "measure_id", "score", "measure_start", "measure_end",
    "sample", "denominator", "care_type")

# merge timely and effective and readmitted measures
procedure_temp = effective_ordered.unionAll(readmission_ordered)

# add id
procedure = procedure_temp.select(
    monotonicallyIncreasingId().alias("procedure_id"), "*")

procedure.write.saveAsTable("procedure", mode="overwrite")
.load(os.path.realpath("Womens Clothing E-Commerce Reviews.csv"))

reviews = data.map(lambda x: x['Review Text']).filter(lambda x: x is not None)
tokens = reviews \
    .map(lambda document: document.strip().lower()) \
    .map(lambda document: re.split(" ", document)) \
    .map(lambda words: [x for x in words if x.isalpha()]) \
    .map(lambda words: [x for x in words if len(x) > 3]) \
    .map(lambda words: [x for x in words if x not in StopWords]) \
    .zipWithIndex()

# build a DataFrame of (list_of_words, index) rows from the tokenized reviews
df_txts = sqlContext.createDataFrame(tokens, ["list_of_words", "index"])
df_txts.show()

# an id column can also be added with monotonicallyIncreasingId
from pyspark.sql.functions import monotonicallyIncreasingId
res = df_txts.withColumn("id", monotonicallyIncreasingId())

# term frequencies
cv = CountVectorizer(inputCol="list_of_words", outputCol="raw_features", vocabSize=3, minDF=2.0)
model = cv.fit(df_txts)
result_cv = model.transform(df_txts)
result_cv.show(truncate=False)

# inverse document frequencies
idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)
def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log-data/*.json')

    logSchema = R([
        Fld('artist', Str()),
        Fld('auth', Str()),
        Fld('firstName', Str()),
        Fld('gender', Str()),
        Fld('itemInSession', Str()),
        Fld('lastName', Str()),
        Fld('length', Dbl()),
        Fld('level', Str()),
        Fld('location', Str()),
        Fld('method', Str()),
        Fld('page', Str()),
        Fld('registration', Dbl()),
        Fld('sessionId', Str()),
        Fld('song', Str()),
        Fld('status', Int()),
        Fld('ts', Dbl()),  # epoch milliseconds; too large for a 32-bit Int
        Fld('userAgent', Str()),
        Fld('userId', Int())
    ])

    # read log data file
    df = spark.read.json(log_data, schema=logSchema)
    df.createTempView('logData')

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table
    users_table = df.select(
        F.col('userId').alias('user_id'),
        F.col('firstName').alias('first_name'),
        F.col('lastName').alias('last_name'),
        'gender',
        'level').dropDuplicates(['user_id'])

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'))

    # create timestamp column from original epoch-millisecond timestamp column
    df = df.withColumn('timestamp', F.to_timestamp(df.ts / 1000))

    # create datetime column from original timestamp column
    df = df.withColumn('date', F.to_date('timestamp'))

    # extract columns to create time table
    time_table = df.select(
        F.col('timestamp').alias('start_time'),
        F.hour('timestamp').alias('hour'),
        F.dayofmonth('timestamp').alias('day'),
        F.weekofyear('timestamp').alias('week'),
        F.month('timestamp').alias('month'),
        F.year('timestamp').alias('year'),
        F.dayofweek('timestamp').alias('weekday'))

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month').parquet(os.path.join(output_data, 'time'))

    # read in song data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data, 'songs'))

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, df.song == song_df.title) \
        .select(
            F.col('timestamp').alias('start_time'),
            F.col('userId').alias('user_id'),
            'level',
            'song_id',
            'artist_id',
            F.col('sessionId').alias('session_id'),
            'location',
            F.col('userAgent').alias('user_agent'))

    # add a surrogate key for each songplay
    songplays_table = songplays_table.withColumn('songplay_id', F.monotonically_increasing_id())

    # write songplays table to parquet files partitioned by year and month
    songplays_table \
        .withColumn('year', F.year('start_time')) \
        .withColumn('month', F.month('start_time')) \
        .write.partitionBy('year', 'month') \
        .parquet(os.path.join(output_data, 'songplays'))
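# Minimal invocation sketch for process_log_data; the SparkSession setup and the local
# input/output paths are assumptions for illustration only. It presumes the songs
# parquet files already exist under output_data, since the function reads them back
# to build the songplays table.
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("process_log_data").getOrCreate()
    process_log_data(spark, input_data="data/", output_data="output/")
    spark.stop()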