def join_clusters_with_lda_data():
    from pyspark import SparkContext, SparkConf
    from pyspark.sql import SQLContext, Row
    from pyspark.sql.functions import monotonicallyIncreasingId

    conf = (SparkConf()
            .set("spark.driver.maxResultSize", "20g")
            .set("spark.rpc.askTimeout", "120000")
            .set("spark.executor.heartbeatInterval", "120000"))

    sc = SparkContext(appName='join_clusters_with_lda_data', conf=conf)
    sqlContext = SQLContext(sc)

    data_path = "/user/rmusters/lda_doc_topic"
    cluster_path = "/user/rmusters/lda_clusters"

    data = sqlContext.read.parquet(data_path)
    data.show()
    data = data.withColumn("id", monotonicallyIncreasingId())

    cluster = sqlContext.read.parquet(cluster_path)  # optionally: .sample(False, 0.00001).map(lambda x: Row(cluster=x[0]))
    cluster.show()
    cluster = cluster.withColumn("id", monotonicallyIncreasingId())

    result = data.join(cluster, on="id")
    result = result.drop("id")
    result = result.withColumnRenamed("_1",
                                      "id").withColumnRenamed("_2", "vectors")
    result.write.parquet("hdfs:///user/rmusters/bisecting_lda_data_cluster",
                         mode="overwrite")
    result.show()
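Note that monotonicallyIncreasingId() only guarantees unique, increasing values; the IDs are not consecutive and depend on partitioning, so joining two DataFrames on independently generated IDs, as above, only pairs rows correctly when both sides are partitioned identically. A hedged alternative sketch that pairs rows by position with zipWithIndex (the function and parameter names here are illustrative, the paths mirror the example above):

def join_by_position(sqlContext, data_path, cluster_path, out_path):
    # zipWithIndex, unlike monotonicallyIncreasingId, does yield consecutive indices
    data_rdd = sqlContext.read.parquet(data_path).rdd \
        .zipWithIndex().map(lambda ri: (ri[1], ri[0]))
    cluster_rdd = sqlContext.read.parquet(cluster_path).rdd \
        .zipWithIndex().map(lambda ri: (ri[1], ri[0]))
    # join on the positional index, then concatenate the two Rows into one tuple
    joined = data_rdd.join(cluster_rdd) \
        .map(lambda kv: tuple(kv[1][0]) + tuple(kv[1][1]))
    # columns come out as _1, _2, ...; rename them afterwards if needed
    joined.toDF().write.parquet(out_path, mode="overwrite")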
def write_data(path):
    # assumes a module-level SparkContext `sc`, a `logger`, and the project's `filter` module
    import filter
    from pyspark.mllib.feature import Word2Vec, Word2VecModel

    # load data
    loc = '/user/rmusters/text/2015/01/*'
    text_file = sc.textFile(loc)
    data = text_file.map(lambda line: filter.filter(line).split(" "))

    # load model
    word2vec = Word2Vec()
    model = Word2VecModel.load(sc, '/user/rmusters/2015model99')

    # get a tweet-vector pair
    from pyspark.sql import SQLContext
    sqlContext = SQLContext(sc)
    lookup = sqlContext.read.parquet('/user/rmusters/2015model99/data').alias("lookup")
    lookup_bd = sc.broadcast(lookup.rdd.collectAsMap())

    vectors = data.map(lambda ws: [lookup_bd.value.get(w) for w in ws])
    logger.info(vectors.count())

    # tuple unpacking in a lambda is Python 2 only, so index into the pair instead;
    # note that only the first word's vector is kept
    data = text_file.map(lambda line: (line, filter.filter(line).split(" "))) \
        .map(lambda tf: (tf[0], tf[1], [lookup_bd.value.get(w) for w in tf[1]][0]))

    from pyspark.sql.functions import monotonicallyIncreasingId
    df = data.toDF(["text", "filtered_text", "vectors"])
    # this returns a new DataFrame with all the columns plus an id column
    res = df.withColumn("id", monotonicallyIncreasingId())
    res.write.parquet(path, mode="overwrite")
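A quick read-back check of the parquet written above, assuming the same `path` and an active SQLContext at the call site:

# hedged check: confirm the generated id column is present
written = sqlContext.read.parquet(path)
written.select("id", "text", "filtered_text").show(5)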
def generate_pk(p_col_pk, p_df):
    """
    Args: p_col_pk - Primary key column name
          p_df - DataFrame for which to create the PK column
    Returns: Spark DataFrame with the new PK column
    """
    # assumes monotonicallyIncreasingId is imported from pyspark.sql.functions
    dfPK = p_df.withColumn(p_col_pk, monotonicallyIncreasingId() + 1)
    return dfPK
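A minimal usage sketch, assuming an active SQLContext and the import noted above; the DataFrame contents are made up for illustration:

from pyspark.sql.functions import monotonicallyIncreasingId

df = sqlContext.createDataFrame([("a",), ("b",), ("c",)], ["value"])
df_with_pk = generate_pk("pk", df)
df_with_pk.show()
# pk values are unique and increasing, but not necessarily consecutive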
Example #4
def canonical_format(df, Item):
    # 1. map to name, id, and manufacturer dictionary
    # 2. map to row number, line tuples
    # 3. collect to a map to provide easy lookup for training algorithm
    def to_dict(row):
        item = Item()
        item.populate(row)
        return item.to_dict()
    # assumes monotonicallyIncreasingId is imported from pyspark.sql.functions
    df = df.withColumn('id', monotonicallyIncreasingId())
    # use df.rdd so the map also works where DataFrame.map is unavailable (Spark 2.x)
    return (df, df.rdd.map(lambda row: (row.id, to_dict(row))).collectAsMap())
def data_jan():
    # assumes a module-level SparkContext `sc` and the project's `filter` module
    from pyspark.sql.functions import monotonicallyIncreasingId
    loc = '/user/rmusters/text/2015/01/*'
    text_file = sc.textFile(loc)
    # parenthesized lambda parameters are Python 2 only
    data = text_file.map(lambda text: (text, filter.filter(text)))
    data = data.toDF(["text", "filtered_text"]).withColumn("id", monotonicallyIncreasingId())

    path = '/user/rmusters/data_jan'
    data.write.parquet(path, mode="overwrite")

# read in readmission data
measure_readmission = hive_context.table("default.measure_readmission")
measure_readmission.registerTempTable("readmission_temp")
measure_readmission_selected = hive_context.sql(
    "SELECT provider_id, measure_id, denominator, score, measure_start, measure_end FROM readmission_temp"
)
# prepare readmissions for a union with effective measures: add empty column for sample
measure_readmission_with_sample = measure_readmission_selected.withColumn(
    'sample',
    lit(None).cast(StringType()))
# add a column to flag the data as readmission
measure_readmission_with_sample_care_type = measure_readmission_with_sample.withColumn(
    'care_type',
    lit("readmission").cast(StringType()))

readmission_ordered = measure_readmission_with_sample_care_type.select(
    "provider_id", "measure_id", "score", "measure_start", "measure_end",
    "sample", "denominator", "care_type")
effective_ordered = measure_effective_with_care_type_denominator.select(
    "provider_id", "measure_id", "score", "measure_start", "measure_end",
    "sample", "denominator", "care_type")

# merge timely and effective and readmitted measures
procedure_temp = effective_ordered.unionAll(readmission_ordered)
# add id
procedure = procedure_temp.select(
    monotonicallyIncreasingId().alias("procedure_id"), "*")
procedure.write.saveAsTable("procedure", mode="overwrite")
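monotonicallyIncreasingId has been deprecated since Spark 1.6 in favour of monotonically_increasing_id; the same surrogate-key pattern with the current name looks like this (reusing procedure_temp from the snippet above):

from pyspark.sql.functions import monotonically_increasing_id

# add an id column with the non-deprecated function name
procedure = procedure_temp.select(
    monotonically_increasing_id().alias("procedure_id"), "*")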
Example #7
# assumes an active SparkContext/SQLContext, the `re` and `os` modules, a `StopWords` list,
# and CountVectorizer/IDF from pyspark.ml.feature; the CSV reader call is reconstructed
data = sqlContext.read.format("csv") \
    .option("header", "true") \
    .load(os.path.realpath("Womens Clothing E-Commerce Reviews.csv"))

reviews = data.rdd.map(lambda x: x['Review Text']).filter(lambda x: x is not None)

tokens = reviews                                                    \
    .map(lambda document: document.strip().lower())                 \
    .map(lambda document: re.split(" ", document))                  \
    .map(lambda words: [x for x in words if x.isalpha()])           \
    .map(lambda words: [x for x in words if len(x) > 3])            \
    .map(lambda words: [x for x in words if x not in StopWords])    \
    .zipWithIndex()

# zipWithIndex yields (list_of_words, index) pairs, which map straight onto a DataFrame
df_txts = sqlContext.createDataFrame(tokens, ["list_of_words", "index"])

cv = CountVectorizer(inputCol="list_of_words", outputCol="raw_features", vocabSize=3, minDF=2.0)
cv_model = cv.fit(df_txts)
result_cv = cv_model.transform(df_txts)
result_cv.show(truncate=False)

# an id column can also be generated directly on the DataFrame
from pyspark.sql.functions import monotonicallyIncreasingId
res = df_txts.withColumn("id", monotonicallyIncreasingId())

idf = IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(result_cv)
result_tfidf = idfModel.transform(result_cv)
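The TF-IDF features built above are usually the input to a topic model; a minimal sketch with pyspark.ml's LDA, assuming result_tfidf from the pipeline above and an arbitrary choice of 10 topics:

from pyspark.ml.clustering import LDA

# fit an LDA model on the TF-IDF features and inspect the top terms per topic
lda = LDA(k=10, maxIter=10, featuresCol="features")
lda_model = lda.fit(result_tfidf)
lda_model.describeTopics(maxTermsPerTopic=5).show(truncate=False)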



def process_log_data(spark, input_data, output_data):
    # get filepath to log data file
    log_data = os.path.join(input_data, 'log-data/*.json')
    logSchema = R(
            [
                Fld('artist', Str()),
                Fld('auth', Str()),
                Fld('firstName', Str()),
                Fld('gender', Str()),
                Fld('itemInSession', Str()),
                Fld('lastName', Str()),
                Fld('length', Dbl()),
                Fld('level', Str()),
                Fld('location', Str()),
                Fld('method', Str()),
                Fld('page', Str()),
                Fld('registration', Dbl()),
                Fld('sessionId', Str()),
                Fld('song', Str()),
                Fld('status', Int()),
                Fld('ts', Dbl()),  # epoch milliseconds overflow a 32-bit int
                Fld('userAgent', Str()),
                Fld('userId', Int())
            ]
    )
    # read log data file
    df = spark.read.json(log_data, schema=logSchema)
    df.createTempView('logData')

    # filter by actions for song plays
    df = df.where(df.page == 'NextSong')

    # extract columns for users table (the log fields use camelCase names)
    users_table = df.selectExpr('userId as user_id', 'firstName as first_name',
                                'lastName as last_name', 'gender', 'level') \
        .dropDuplicates(['user_id'])

    # write users table to parquet files
    users_table.write.parquet(os.path.join(output_data, 'users'))

    # create timestamp column from the original epoch-millisecond ts column
    df = df.withColumn('timestamp', (F.col('ts') / 1000).cast('timestamp'))

    # create datetime column from the new timestamp column
    df = df.withColumn('date', F.to_date(F.col('timestamp')))

    # extract columns to create time table (year is added so the table can be partitioned)
    time_table = df.selectExpr('timestamp as start_time',
                               'hour(timestamp) as hour',
                               'dayofmonth(timestamp) as day',
                               'weekofyear(timestamp) as week',
                               'month(timestamp) as month',
                               'year(timestamp) as year',
                               'dayofweek(timestamp) as weekday')

    # write time table to parquet files partitioned by year and month
    time_table.write.partitionBy('year', 'month') \
        .parquet(os.path.join(output_data, 'time'))

    # read in song data to use for songplays table
    song_df = spark.read.parquet(os.path.join(output_data, 'songs'))

    # extract columns from joined song and log datasets to create songplays table
    songplays_table = df.join(song_df, df.song == song_df.title) \
        .selectExpr('timestamp as start_time', 'userId as user_id', 'level', 'song_id',
                    'artist_id', 'sessionId as session_id', 'location',
                    'userAgent as user_agent')

    # add a songplay id plus year/month columns for partitioning
    songplays_table = songplays_table \
        .withColumn('songplay_id', F.monotonically_increasing_id()) \
        .withColumn('year', F.year('start_time')) \
        .withColumn('month', F.month('start_time'))

    # write songplays table to parquet files partitioned by year and month
    songplays_table.write.partitionBy('year', 'month') \
        .parquet(os.path.join(output_data, 'songplays'))
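A hedged driver sketch for calling process_log_data; the app name and the S3 paths are placeholders, not the project's actual configuration:

from pyspark.sql import SparkSession


def main():
    spark = SparkSession.builder.appName('sparkify_etl').getOrCreate()
    input_data = 's3a://example-bucket/input/'    # placeholder path
    output_data = 's3a://example-bucket/output/'  # placeholder path
    process_log_data(spark, input_data, output_data)
    spark.stop()


if __name__ == '__main__':
    main()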