def calcDSPD(dt, hr, country, stage, upper=dv.upper, lower=dv.lower, thres=dv.thres):
    SD = SpD(upper=upper, lower=lower, thres=thres)
    dspd_helper = DSPD(dt=dt, hr=hr, stage=stage, country=country,
                       path=dv.input_path, output=dv.output_path)
    dat = dspd_helper.load()

    datn = dat.where("(too_freq_uid!=false or r_s_info is not null) and sl_adjusted_confidence in (94,95)")
    datn = datn.select(['request_id'] + dspd_helper.keys)\
               .withColumn("derived_speed", F.lit(dumpJSON(-1.0, -1.0)))
    for coln in ['m', 'd', 'h']:
        datn = toStr(datn, coln)
    logging.info("Writing data without derived speed and angle for {}, {}, {}".format(
        dspd_helper.dt, dspd_helper.hr, dspd_helper.cntry))
    dspd_helper.write(datn)
    logging.info("Done writing")

    datc = dat.where("too_freq_uid=false and r_s_info is null and sl_adjusted_confidence in (94,95)")\
              .withColumn("sec", F.round(dat["r_timestamp"] / (1000 * float(lower)), 0) * float(lower))\
              .groupby(['uid', 'sec'])\
              .agg(collect_set(struct(*(['request_id'] + dspd_helper.keys))).alias('val_info'),
                   F.avg(F.round('latitude', 5)).alias('lat'),
                   F.avg(F.round('longitude', 5)).alias('long'))\
              .groupby('uid')\
              .agg(collect_set(struct('sec', 'val_info', 'lat', 'long')).alias('comb'))
    datc = datc\
        .repartition(1000)\
        .rdd\
        .map(lambda x: SD.getSpeed(x))\
        .flatMap(lambda x: x)
    # schema = StructType(dat.schema.fields + [StructField("derived_speed", StringType(), True)])
    datc = spark.createDataFrame(datc, schema=datn.schema)
    logging.info("Writing data with derived speed and angle for {}, {}, {}".format(
        dspd_helper.dt, dspd_helper.hr, dspd_helper.cntry))
    dspd_helper.write(datc)
    logging.info("Done writing")
def extract_data(self):
    """Method to extract data from the csv file."""
    works_data = self.data_path + '*'
    works_data_df = self.spark.read.load(works_data, format="csv", header="true")

    unicode_conversion = udf(lambda value: unicodedata.normalize(
        'NFKD', value).encode('ascii', 'ignore').decode())
    works_data_df = works_data_df.withColumn(
        'converted_title', unicode_conversion(col('title')))
    works_data_df = works_data_df.withColumn(
        'converted_contributors', unicode_conversion(col('contributors')))

    reconciled_data = works_data_df.select('*') \
        .groupBy('iswc') \
        .agg(concat_ws(', ', collect_set('converted_title')).alias('title'),
             concat_ws('|', collect_set('converted_contributors')).alias('contributors'),
             concat_ws(', ', collect_set('source')).alias('sources')) \
        .dropDuplicates() \
        .na.drop()

    return reconciled_data
def n_gram_fingerprint_cluster(df, input_cols, n_size=2):
    """
    Cluster a DataFrame column based on the N-Gram Fingerprint algorithm
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :param n_size: Size of the n-grams used to build the fingerprint
    :return: Dataframe with the clusters for each input column
    """
    input_cols = parse_columns(df, input_cols)
    for input_col in input_cols:
        ngram_fingerprint_col = name_col(input_col, NGRAM_FINGERPRINT_COL)

        # Prepare a group so we do not need to apply the fingerprint to the whole data set
        df = (df.select(input_col)
              .groupBy(input_col)
              .count()
              .select('count', input_col)
              .repartition(1)  # Needed for optimization in a single machine
              .cache())

        df = n_gram_fingerprint(df, input_col, n_size)

        count_col = name_col(input_col, COUNT_COL)
        cluster_col = name_col(input_col, CLUSTER_COL)
        recommended_col = name_col(input_col, RECOMMENDED_COL)
        cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)

        df = df.groupby(ngram_fingerprint_col).agg(
            F.collect_set(input_col).alias(cluster_col),
            F.sum("count").alias(count_col),
            F.first(input_col).alias(recommended_col),
            F.size(F.collect_set(input_col)).alias(cluster_size_col)).select(
                cluster_size_col, cluster_col, count_col, recommended_col)

    return df
def test_collect_functions(self):
    df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    from pyspark.sql import functions

    self.assertEqual(
        sorted(df.select(functions.collect_set(df.key).alias("r")).collect()[0].r),
        [1, 2])
    self.assertEqual(
        sorted(df.select(functions.collect_list(df.key).alias("r")).collect()[0].r),
        [1, 1, 1, 2],
    )
    self.assertEqual(
        sorted(df.select(functions.collect_set(df.value).alias("r")).collect()[0].r),
        ["1", "2"])
    self.assertEqual(
        sorted(df.select(functions.collect_list(df.value).alias("r")).collect()[0].r),
        ["1", "2", "2", "2"],
    )
def main(spark):
    # The processing code.
    df = createDataFrame(spark)
    df.show(truncate=False)

    rightDf = df.withColumnRenamed("acct", "acct2") \
        .withColumnRenamed("bssn", "bssn2") \
        .withColumnRenamed("name", "name2") \
        .drop("tid")

    joinedDf = df.join(rightDf, df["acct"] == rightDf["acct2"], "leftsemi") \
        .drop(rightDf["acct2"]) \
        .drop(rightDf["name2"]) \
        .drop(rightDf["bssn2"])
    joinedDf.show(truncate=False)

    listDf = joinedDf.groupBy(F.col("acct")) \
        .agg(F.collect_list("bssn"), F.collect_list("name"))
    listDf.show(truncate=False)

    setDf = joinedDf.groupBy(F.col("acct")) \
        .agg(F.collect_set("bssn"), F.collect_set("name"))
    setDf.show(truncate=False)
def fingerprint_cluster(df, input_cols):
    """
    Cluster a dataframe column based on the Fingerprint algorithm
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :return: Dataframe with the clusters for each input column
    """
    # df = self.df
    input_cols = parse_columns(df, input_cols)
    for input_col in input_cols:
        output_col = name_col(input_col, FINGERPRINT_COL)

        # Instead of applying the fingerprint to the whole data set, group by values first
        df = (df.groupBy(input_col)
              .count()
              .select('count', input_col)
              .repartition(1)  # Needed for optimization in a single machine
              .cache())

        # Calculate the fingerprint
        df = fingerprint(df, input_col)

        count_col = name_col(input_col, COUNT_COL)
        cluster_col = name_col(input_col, CLUSTER_COL)
        recommended_col = name_col(input_col, RECOMMENDED_COL)
        cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)

        df = df.groupby(output_col).agg(
            F.collect_set(input_col).alias(cluster_col),
            F.sum("count").alias(count_col),
            F.first(input_col).alias(recommended_col),
            F.size(F.collect_set(input_col)).alias(cluster_size_col)).select(
                cluster_size_col, cluster_col, count_col, recommended_col)

    return df
def n_gram_fingerprint_cluster(df, columns, n_size=2):
    """
    Cluster a DataFrame column based on the N-Gram Fingerprint algorithm
    :param df: Dataframe to be processed
    :param columns: Columns to be processed
    :param n_size: Size of the n-grams used to build the fingerprint
    :return: Dataframe with the clusters for each column
    """
    columns = parse_columns(df, columns)
    for col_name in columns:
        n_gram_col = col_name + "_ngram_fingerprint"

        # Prepare a group so we do not need to apply the fingerprint to the whole data set
        df = (df.select(col_name)
              .groupBy(col_name)
              .count()
              .select('count', col_name)
              .repartition(1)  # Needed for optimization in a single machine
              .cache())

        df = KeyCollision.n_gram_fingerprint(df, col_name, n_size)
        # df.table()

        df = df.groupby(n_gram_col).agg(
            F.collect_set(col_name).alias("cluster"),
            F.sum("count").alias("count"),
            F.first(col_name).alias("recommended"),
            F.size(F.collect_set(col_name)).alias("cluster_size")).select(
                "cluster_size", "cluster", "count", "recommended")

    return df
def resolve_image_record_parameter_association(
        image_record_observation_df: DataFrame,
        simple_observations_df: DataFrame):
    simple_df = simple_observations_df.alias("simple")
    image_df = image_record_observation_df.alias("image").withColumn(
        "parameterAsc",
        explode("image.seriesMediaParameterValue.parameterAssociation"))

    image_vs_simple_parameters_df = image_df.join(
        simple_df,
        (col("simple.experiment_id") == col("image.experiment_id"))
        & (col("simple.parameter_stable_id") == col("parameterAsc._parameterID")),
    )
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumn(
        "paramName", col("simple.parameter_name"))
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumn(
        "paramSeq", lit("0"))
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumn(
        "paramValue",
        when(col("data_point").isNotNull(), col("data_point")).otherwise(
            when(col("category").isNotNull(),
                 col("category")).otherwise(col("text_value"))),
    )
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.groupBy(
        col("image.observation_id"), col("image.parameter_stable_id")).agg(
            collect_set("parameterAsc._parameterID").alias("paramIDs"),
            collect_set("paramName").alias("paramNames"),
            collect_set("paramSeq").alias("paramSeqs"),
            collect_set("paramValue").alias("paramValues"),
        )
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumnRenamed(
        "observation_id", "img_observation_id").withColumnRenamed(
            "parameter_stable_id", "img_parameter_stable_id")
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.select(
        "img_observation_id",
        "img_parameter_stable_id",
        "paramIDs",
        "paramNames",
        "paramSeqs",
        "paramValues",
    )

    image_record_observation_df = image_record_observation_df.join(
        image_vs_simple_parameters_df,
        (image_record_observation_df["observation_id"]
         == image_vs_simple_parameters_df["img_observation_id"])
        & (image_record_observation_df["parameter_stable_id"]
           == image_vs_simple_parameters_df["img_parameter_stable_id"]),
        "left_outer",
    )
    image_record_observation_df = (
        image_record_observation_df.withColumnRenamed(
            "paramIDs", "parameter_association_stable_id").withColumnRenamed(
                "paramNames", "parameter_association_name").withColumnRenamed(
                    "paramSeqs",
                    "parameter_association_sequence_id").withColumnRenamed(
                        "paramValues", "parameter_association_value"))
    return image_record_observation_df
def scd_analyze(df, merge_on=None, state_col='_state', updated_col='_updated'):
    add_ids = '##add_ids'
    del_ids = '##del_ids'
    upd_ids = '##upd_ids'

    # Business columns: everything except the SCD bookkeeping columns.
    c = set(df.columns).difference({state_col, updated_col})
    colnames = [x for x in df.columns if x in c]

    # Key columns to merge on (defaults to all business columns).
    on = merge_on or colnames
    on = on if isinstance(on, (list, tuple)) else [on]
    on = [c for c in on if c in colnames]

    s = on + [state_col, updated_col]
    cols = [x for x in df.columns if x not in s]

    # Ids added (state 0) and deleted (state 1), grouped per update timestamp.
    a = df.filter(f'{state_col} = 0') \
        .groupby(updated_col) \
        .agg(F.collect_set(F.concat(*on)).alias(add_ids)) \
        .select(updated_col, add_ids)

    d = df.filter(f'{state_col} = 1') \
        .groupby(updated_col) \
        .agg(F.collect_set(F.concat(*on)).alias(del_ids)) \
        .select(updated_col, del_ids)

    res = a.join(d, on=updated_col, how='outer')
    res = res.select(updated_col,
                     F.coalesce(add_ids, F.array([])).alias(add_ids),
                     F.coalesce(del_ids, F.array([])).alias(del_ids))

    if cols:
        # Per-column change counts, summarised as a map per update timestamp.
        agg_funcs = [(F.countDistinct(x) - F.lit(1)).alias(x) for x in cols]
        cnt = df.groupby(*on, updated_col).agg(*agg_funcs)

        agg_names = [F.lit(x) for x in cols]
        agg_sums = [F.sum(x) for x in cols]
        cnt = cnt.groupby(updated_col).agg(
            F.map_from_arrays(F.array(*agg_names), F.array(*agg_sums)).alias('changes'))

        res = res.join(cnt, on=updated_col)
    else:
        res = res.withColumn('changes', F.lit(None))

    # Ids present in both sets were updated rather than purely added/deleted.
    res = res.select('*', F.array_intersect(add_ids, del_ids).alias(upd_ids))
    res = res.select(
        F.col(updated_col).alias('updated'),
        F.size(upd_ids).alias('upd'),
        F.size(F.array_except(add_ids, upd_ids)).alias('add'),
        F.size(F.array_except(del_ids, upd_ids)).alias('del'),
        'changes')

    return res.orderBy('updated')
def generate_TFIDF(sc, df, sqlcontext):
    # 1. Count the number of rows (documents) in the DataFrame.
    t_num = df.count()

    # 2. Select _id, lower-case the text_entry, strip punctuation symbols,
    #    and split it into a list of words ('tokens').
    word_splits = df.select(
        "_id",
        F.split(F.lower(F.regexp_replace(df.text_entry, r'[^\w\s]', '')), ' ').alias('tokens'))

    # 3. Explode the list of words into (_id, token) rows, then group by _id and token
    #    to compute the term frequency (tf) of each token in each document.
    #    Result: words_tf(_id, token, tf)
    words_tf = word_splits.select("_id", F.explode(word_splits.tokens).alias('token')) \
        .groupBy("_id", "token").agg({'token': 'count'}) \
        .withColumnRenamed("count(token)", "tf")

    # 4. To compute the document frequency (df) of each token, group by token and
    #    collect the set of distinct _ids ('collect_set' eliminates duplicates);
    #    the size of that set is the token's document frequency.
    #    Result: words_df(token, _id, df)
    words_df = words_tf.groupby("token").agg(F.collect_set("_id").alias("_ids")) \
        .select("token", F.explode("_ids").alias('_id'), F.size("_ids").alias('df'))

    # 5. Join words_tf and words_df on matching _id and token, then compute
    #    idf = log10(t_num / df) and tf_idf = tf * idf.
    tokensWithTfIdf = words_tf.join(
        words_df,
        (words_tf._id == words_df._id) & (words_tf.token == words_df.token)) \
        .select(words_tf._id, words_tf.token, words_tf.tf, words_df.df,
                (F.log10(t_num / words_df.df)).alias("idf"),
                (F.log10(t_num / words_df.df) * words_tf.tf).alias("tf_idf"))

    # 6. Cache the TF-IDF DataFrame for further use.
    tokensWithTfIdf.cache()

    return tokensWithTfIdf
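# A minimal usage sketch for the TF-IDF helper above (not from the original source).
# It assumes pyspark.sql.functions is imported as F at module level, as generate_TFIDF
# expects, and builds a tiny DataFrame with the required `_id` and `text_entry` columns.
# The `sc` and `sqlcontext` arguments are accepted but never used by the function body.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("tfidf-demo").getOrCreate()

docs = spark.createDataFrame(
    [("d1", "To be, or not to be"),
     ("d2", "Brevity is the soul of wit")],
    ["_id", "text_entry"])

tfidf = generate_TFIDF(spark.sparkContext, docs, spark)
tfidf.orderBy(F.desc("tf_idf")).show(truncate=False)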
def to_user_reviewed_products(
        reviews_dataframe: DataFrame,
        cross_bin_col: str = "cross_bin_number") -> DataFrame:
    """
    TODO: remember to change the cross_bin_col name

    Return users' positive reviews grouped by user, with item indexes collected per user.

    Args:
        reviews_dataframe (DataFrame):
            +-----------+----------+----------------+----------------+-----------------+
            |customer_id|product_id|product_id_index|cross_bin_number|customer_id_index|
            +-----------+----------+----------------+----------------+-----------------+
            |   10686361|B000GDBOPQ|               0|               4|              143|
            |   22517088|B000GF33I0|               1|               6|             2561|
            |   14770984|B000GFD4C0|               2|               5|             1174|
            |   40268049|B000GFD4C0|               3|               6|             4776|
            |   44060334|B000GFD4C0|               4|               4|             5342|
            +-----------+----------+----------------+----------------+-----------------+

    Returns:
        DataFrame:
            root
             |-- customer_id: string (nullable = true)
             |-- customer_id_index: integer (nullable = true)
             |-- cross_bin_number: integer (nullable = true)
             |-- positives_ids: array (nullable = true)
             |    |-- element: integer (containsNull = false)
    """
    return reviews_dataframe.groupby([
        "customer_id", "customer_id_index", cross_bin_col
    ]).agg(F.collect_set("product_id_index").alias("positives_ids"))
def create_intervals_to_keep(df, window):
    ''' Creates merged intervals from the significant positions '''

    # Create interval column
    intervals = (df.withColumn(
        'interval',
        array(F.col('pos') - window, F.col('pos') + window)).drop('pos'))

    interval_reducer_fn = udf(lambda key: interval_reducer(key),
                              ArrayType(ArrayType(IntegerType())))

    # Merge intervals
    m_intervals = (intervals
                   .groupby('study_id', 'phenotype_id', 'bio_feature', 'chrom')
                   .agg(F.collect_set('interval').alias('intervals'))
                   .withColumn('intervals', interval_reducer_fn('intervals'))
                   .withColumn('interval', F.explode('intervals')))

    merged_intervals = (m_intervals
                        .withColumn('start', m_intervals['interval'][0])
                        .withColumn('end', m_intervals['interval'][1])
                        .withColumn('start',
                                    when(F.col('start') > 0, F.col('start')).otherwise(0))
                        .drop('interval', 'intervals'))

    # merged_intervals.show()

    return merged_intervals
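# The snippet above wraps an `interval_reducer` helper that is not shown. The sketch
# below is one plausible implementation (an assumption, not the original code): it takes
# the collected list of [start, end] pairs and merges any that overlap or touch, which
# matches the ArrayType(ArrayType(IntegerType())) return type declared for the UDF.
def interval_reducer(intervals):
    """Merge overlapping or adjacent [start, end] intervals into a minimal set."""
    merged = []
    for start, end in sorted(intervals):
        if merged and start <= merged[-1][1]:
            # Overlaps (or touches) the previous interval: extend it.
            merged[-1][1] = max(merged[-1][1], end)
        else:
            merged.append([start, end])
    return merged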
def algorithm1(i, g):
    while True:
        aggregates = g.aggregateMessages(
            F.collect_set(AM.msg).alias("agg"),
            sendToDst=F.when(AM.src['value'] == -1, AM.src["id"]))

        new_vertices = g.vertices.join(
            aggregates, on="id", how="left_outer").withColumn(
                "newValue",
                getid_maximum_udf2("id", "agg", lit(i), "value")).drop("agg").withColumn(
                    'max_by_rows', greatest('value', 'newValue')).drop(
                        "value", "newValue").withColumnRenamed("max_by_rows", "value")

        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()

        g.vertices.createOrReplaceTempView("temp_table")
        if spark.sql("SELECT * from temp_table where value = -1").count() == 0:
            final_df = g.vertices
            break

    return final_df
def psm_table(psm, pep, out_path):
    if not os.path.isdir(out_path):
        print('The output_path specified does not exist: ' + out_path)
        sys.exit(1)

    sql_context = SparkSession.builder.getOrCreate()
    df_psm = sql_context.read.parquet(psm)
    df_pep = sql_context.read.parquet(pep)

    df_pep_exploded = df_pep.select(
        Fields.PROTEIN_ACCESSION,
        explode(Fields.PSM_SPECTRUM_ACCESSIONS).alias("psm"))
    df_pep_select = df_pep_exploded.groupby('psm.usi').agg(
        functions.collect_set(Fields.PROTEIN_ACCESSION)).toDF(
            Fields.USI, Fields.PROTEIN_ACCESSION)

    df_psm_exploded = df_psm.select(
        Fields.USI,
        explode(Fields.ADDITIONAL_ATTRIBUTES).alias(Fields.ADDITIONAL_ATTRIBUTES),
        Fields.ASSAY_ACCESSION, Fields.PEPTIDE_SEQUENCE,
        Fields.MODIFIED_PEPTIDE_SEQUENCE, Fields.CHARGE,
        Fields.PRECURSOR_MASS, Fields.IS_DECOY)
    df_psm_filtered = df_psm_exploded.filter(
        "additionalAttributes.accession == 'MS:1002355'")

    df_join = df_psm_filtered.join(df_pep_select,
                                   df_psm_filtered.usi == df_pep_select.usi,
                                   how='left') \
        .select(df_psm_filtered.usi, Fields.ASSAY_ACCESSION,
                Fields.PEPTIDE_SEQUENCE, Fields.MODIFIED_PEPTIDE_SEQUENCE,
                Fields.PROTEIN_ACCESSION,
                'additionalAttributes.name', 'additionalAttributes.value',
                Fields.CHARGE, Fields.PRECURSOR_MASS, Fields.IS_DECOY) \
        .toDF(psmtable.USI, psmtable.PX_PROJECT_ACCESSION, psmtable.PEPTIDE,
              psmtable.MODIFIED_PEPTIDE, psmtable.PROTEINS,
              psmtable.ID_SCORE_NAME, psmtable.ID_SCORE_VALUE,
              psmtable.CHARGE, psmtable.MASS, psmtable.IS_DECOY)

    # df_join.show(truncate=False)
    df_join.write.parquet(out_path, mode='append', compression='snappy')
def algorithm2(i, g):
    while True:
        aggregates = g.aggregateMessages(
            F.collect_set(AM.msg).alias("agg"),
            sendToDst=F.when(AM.src['value'] == -1, AM.src["id"]))

        new_vertices = g.vertices.join(
            aggregates, on="id", how="left_outer").withColumn(
                "newValue",
                getid_maximum_udf2("id", "agg", lit(i), "value")).drop("agg").withColumn(
                    'max_by_rows', greatest('value', 'newValue')).drop(
                        "value", "newValue").withColumnRenamed("max_by_rows", "value")

        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()

        if g.filterVertices("value == -1").dropIsolatedVertices().edges.count() == 0:
            final_df = g.vertices
            final_df = final_df.withColumn(
                "value",
                F.when(final_df["value"] == -1, i).otherwise(final_df["value"]))
            break

    return final_df
def main(input_dir, output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('created_utc_iso', types.DateType()),
        types.StructField('polarity_subjectivity', types.ArrayType(types.FloatType()))
    ])

    headlines_df = spark.read.json(input_dir, encoding='utf-8', schema=df_schema).repartition(80)

    split_sentiment_df = headlines_df.withColumn(
        'polarity', functions.element_at(headlines_df['polarity_subjectivity'], 1)
    ).withColumn(
        'subjectivity', functions.element_at(headlines_df['polarity_subjectivity'], 2)
    ).cache()

    for year_int in range(2008, 2020):
        print('Plotting for ' + str(year_int))

        headlines_year = split_sentiment_df.where(
            functions.year(split_sentiment_df['created_utc_iso']) == year_int
        ).withColumn('year', functions.year(split_sentiment_df['created_utc_iso']))

        headlines_grouped = headlines_year.groupBy(headlines_year['year']).agg(
            functions.collect_set(headlines_year['title_clean']).alias('titles_group'))

        headlines_joined = headlines_grouped.select(
            functions.array_join(headlines_grouped['titles_group'], ' ').alias('joined'))

        # Only one row remains: the concatenated headlines for the year.
        string_to_plot = headlines_joined.collect()[0]['joined']

        wordcloud = WordCloud(background_color='white', stopwords=stopwords,
                              width=1000, height=500).generate(string_to_plot)
        wordcloud.to_file(output_dir + '/' + str(year_int) + '_words.png')
def runAggregateFunctions(spark, df1, df2):
    # collect_list, collect_set
    doubledDf1 = df1.union(df1)
    doubledDf1.select(functions.collect_list(doubledDf1["name"])).show(truncate=False)
    doubledDf1.select(functions.collect_set(doubledDf1["name"])).show(truncate=False)

    # count, countDistinct
    doubledDf1.select(functions.count(doubledDf1["name"]),
                      functions.countDistinct(doubledDf1["name"])).show(truncate=False)

    # sum
    df2.printSchema()
    df2.select(sum(df2["price"])).show(truncate=False)

    # grouping, grouping_id
    df2.cube(df2["store"], df2["product"]).agg(
        sum(df2["amount"]), grouping(df2["store"])).show(truncate=False)
    df2.cube(df2["store"], df2["product"]).agg(
        sum(df2["amount"]), grouping_id(df2["store"], df2["product"])).show(truncate=False)

    # Sorting with grouping_id
    df2.cube(df2["store"], df2["product"]) \
        .agg(sum("amount").alias("sum"), grouping_id("store", "product").alias("gid")) \
        .filter("gid != '2'") \
        .sort(asc("store"), col("gid")) \
        .na.fill({"store": "Total", "product": "-"}) \
        .select("store", "product", "sum") \
        .show(truncate=False)
def feature_imp_pyspark(self):
    num_var = [i[0] for i in self.data_frame.dtypes
               if ((i[1] == 'int') | (i[1] == 'double')) & (i[0] != self.target)]
    num_var = [col for col in num_var if not col.endswith('indexed')]
    # labels_count = [len(self.data_frame.select(col).distinct().collect()) for col in num_var]
    labels_count = [len(self.data_frame.agg((F.collect_set(col).alias(col))).first().asDict()[col])
                    for col in num_var]
    labels_count.sort()
    max_count = labels_count[-1]
    # one_hot = [col for col in self.data_frame.columns if col.endswith('_indexed_encoded')]
    # num_var.extend(one_hot)

    label_indexes = StringIndexer(inputCol=self.target, outputCol='label', handleInvalid='keep')
    assembler = VectorAssembler(inputCols=num_var, outputCol="features")

    if self.problem_type == 'REGRESSION':
        model = RandomForestRegressor(labelCol="label",
                                      featuresCol="features", seed=8464,
                                      numTrees=10, cacheNodeIds=True,
                                      subsamplingRate=0.7)
    else:
        model = RandomForestClassifier(labelCol="label",
                                       featuresCol="features", seed=8464,
                                       numTrees=10, cacheNodeIds=True,
                                       subsamplingRate=0.7, maxBins=max_count + 2)

    pipe = Pipeline(stages=[assembler, label_indexes, model])
    mod_fit = pipe.fit(self.data_frame)
    df2 = mod_fit.transform(self.data_frame)

    cols = MLUtils.ExtractFeatureImp(mod_fit.stages[-1].featureImportances, df2, "features")
    cols_considered = cols.loc[cols['score'] > 0]
    cols_considered = list(cols_considered['name'])
    # tree_fs = list(set(cols_considered) & set(self.data_frame.columns))
    # tree_fs.extend(list(set([encoded for encoded in one_hot for column in cols_considered
    #                          if column.startswith(encoded)])))
    self.data_change_dict['SelectedColsTree'] = cols_considered

    if self.target not in cols_considered:
        cols_considered.append(self.target)
    return cols_considered
def levels(self) -> list:
    """
    Names of index columns in list.

    .. note:: Be aware of the possibility of running into out of memory issue
        if returned list is huge.

    Examples
    --------
    >>> mi = pd.MultiIndex.from_arrays((list('abc'), list('def')))
    >>> mi.names = ['level_1', 'level_2']
    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=mi)
    >>> kdf.index.levels
    [['a', 'b', 'c'], ['d', 'e', 'f']]

    >>> mi = pd.MultiIndex.from_arrays((list('bac'), list('fee')))
    >>> mi.names = ['level_1', 'level_2']
    >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=mi)
    >>> kdf.index.levels
    [['a', 'b', 'c'], ['e', 'f']]
    """
    scols = self._kdf._internal.index_scols
    row = self._kdf._sdf.select([F.collect_set(scol) for scol in scols]).first()

    # Sort each level because pandas does not preserve the appearance order of level
    # values; e.g. ['b', 'd', 'a'] is returned as ['a', 'b', 'd'].
    return [sorted(col) for col in row]
def main():
    spark = SparkSession \
        .builder \
        .getOrCreate()
    spark.sparkContext.setCheckpointDir('gs://reddit_data_soen498/checkpoint/')

    @udf("boolean")
    def isNotDefault(x):
        defaultSubs = ["Art", "AskReddit", "DIY", "Documentaries", "EarthPorn",
                       "Futurology", "GetMotivated", "IAmA", "InternetIsBeautiful",
                       "Jokes", "LifeProTips", "Music", "OldSchoolCool",
                       "Showerthoughts", "UpliftingNews", "announcements",
                       "askscience", "aww", "blog", "books", "creepy",
                       "dataisbeautiful", "explainlikeimfive", "food", "funny",
                       "gadgets", "gaming", "gifs", "history", "listentothis",
                       "mildlyinteresting", "movies", "news", "nosleep",
                       "nottheonion", "personalfinance", "philosophy",
                       "photoshopbattles", "pics", "science", "space", "sports",
                       "television", "tifu", "todayilearned", "videos", "worldnews"]
        return x not in defaultSubs

    data = spark.read.json("gs://reddit_data_soen498/RC_2018-02.json")
    keep = [data.author, data.id, data.subreddit]
    data = data.select(*keep)
    data = data.filter(data.author != "[deleted]")
    data = data.filter(isNotDefault(data.subreddit))
    data = data.groupBy(data.author).agg(F.collect_set("subreddit").alias("items"))

    size_ = udf(lambda xs: len(xs), IntegerType())
    data = data.filter(size_(data.items) > 1)
    data = data.select(data.items)

    support = 200 / data.count()
    fp = FPGrowth(minSupport=support, minConfidence=0.5)
    fpm = fp.fit(data)
    fpm.associationRules.show(100)
    fpm.save("gs://reddit_data_soen498/modelFP_noDefaultSub_20support")
def main(keyspace, outdir, orderkeys):
    # main logic starts here
    order_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='orders', keyspace=keyspace).load()
    order_df.createOrReplaceTempView('orders')

    part_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='part', keyspace=keyspace).load()
    part_df.createOrReplaceTempView('part')

    lineitem_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='lineitem', keyspace=keyspace).load()
    lineitem_df.createOrReplaceTempView('lineitem')

    summary_table = spark.sql(('''
        SELECT o.orderkey, o.totalprice, p.name
        FROM orders o
        JOIN lineitem l ON o.orderkey = l.orderkey
        JOIN part p ON p.partkey = l.partkey
        WHERE o.orderkey IN %s
        ORDER BY o.orderkey, p.name
    ''' % orderkeys).replace('[', '(').replace(']', ')'))

    group_table = summary_table.groupBy('orderkey', 'totalprice').agg(
        functions.collect_set('name'))
    group_table = group_table.orderBy(group_table.orderkey)
    group_table.explain()

    order_rdd = group_table.rdd.map(output_line)
    order_rdd.saveAsTextFile(outdir)
def nunique(self, df):
    """Calculates number of unique values in a column over a window"""
    w = self.get_window(self.partition_by, self.order_by, self.window_length)
    return df.withColumn(
        self.column_alias,
        psf.size(psf.collect_set(self.aggregation_column).over(w)))
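# A standalone sketch of the same technique (not part of the class above): Spark does
# not support countDistinct over a window, so size(collect_set(...).over(w)) is the
# usual workaround. The column names (`user_id`, `page`) and the window spec are
# assumptions made for illustration only.
from pyspark.sql import SparkSession, Window
from pyspark.sql import functions as psf

spark = SparkSession.builder.getOrCreate()
events = spark.createDataFrame(
    [("u1", "home"), ("u1", "search"), ("u1", "home"), ("u2", "home")],
    ["user_id", "page"])

w = Window.partitionBy("user_id")
events.withColumn("distinct_pages", psf.size(psf.collect_set("page").over(w))).show()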
def main(keyspace, outdir, orderkeys):
    # Create Orders view
    orders_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='orders', keyspace=keyspace).load()
    orders_df.createOrReplaceTempView('orders')

    # Create Parts view
    part_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='part', keyspace=keyspace).load()
    part_df.createOrReplaceTempView('part')

    # Create LineItems view
    line_item_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='lineitem', keyspace=keyspace).load()
    line_item_df.createOrReplaceTempView('lineitem')

    # Join the tables with a SQL query
    join_table = spark.sql('''
        select o.orderkey, o.totalprice, p.name
        from Orders o
        join lineitem l on o.orderkey = l.orderkey
        join part p ON l.partkey = p.partkey
        where o.orderkey in ''' + str(orderkeys))

    # Collapse each order into a single row, collecting its part names into a set
    formatted_summary = join_table.groupBy('orderkey', 'totalprice') \
        .agg(functions.collect_set('name')).alias('names')
    formatted_summary = formatted_summary.orderBy(formatted_summary.orderkey)
    formatted_summary.show()

    lines = formatted_summary.rdd
    lines = lines.map(output_line)
    lines.coalesce(1).saveAsTextFile(outdir)
def evaluation(df, model, ks):
    '''
    Evaluate the model.
    ks: a list of the values of k used in precision at k and NDCG at k.
    '''
    print(' Make predictions...')
    predictions = model.recommendForUserSubset(df, 500)

    print(' Prepare ground truth set and predicted set...')
    labels = df.groupBy('user').agg(F.collect_set('item')).collect()
    user_pred = predictions.select('user', 'recommendations.item').rdd.flatMap(lambda x: [x]).collect()
    labels = sorted(labels, key=lambda x: x.user)
    user_pred = sorted(user_pred, key=lambda x: x.user)

    print(' Combine ground truth set and predicted set...')
    predictionAndLabels = []
    for i in range(len(user_pred)):
        predictionAndLabels.append((user_pred[i].item, labels[i][1]))

    print(' Parallelize...')
    predictionAndLabels = sc.parallelize(predictionAndLabels, numSlices=2000)

    print(' Calculate metrics...')
    metrics = RankingMetrics(predictionAndLabels)
    eval_results = []
    eval_results.append(metrics.meanAveragePrecision)
    for k in ks:
        eval_results.append(metrics.precisionAt(k))
        eval_results.append(metrics.ndcgAt(k))

    return eval_results
def main(key_space, outdir, orderkeys):
    Where_condition = tuple([int(x) for x in orderkeys])

    orders_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='orders', keyspace=key_space).load()
    orders_df.createOrReplaceTempView('orders')

    line_item_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='lineitem', keyspace=key_space).load()
    line_item_df.createOrReplaceTempView('lineitem')

    parts_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='part', keyspace=key_space).load()
    parts_df.createOrReplaceTempView('part')

    query = """SELECT o.*, p.name
               FROM orders o
               JOIN lineitem l ON (o.orderkey = l.orderkey)
               JOIN part p ON (p.partkey = l.partkey)
               WHERE o.orderkey IN {}""".format(Where_condition)
    join_df = spark.sql(query)

    join_df = join_df.groupBy(join_df['orderkey'], join_df['totalprice']).agg(
        f.collect_set(join_df['name']))
    join_df.explain()

    join_rdd = join_df.rdd
    # join_rdd.take(10)
    join_rdd = join_rdd.map(output_line)
    # join_rdd.take(10)
    join_rdd.saveAsTextFile(outdir)
def test_collect_functions(self):
    df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
    from pyspark.sql import functions

    self.assertEqual(
        sorted(df.select(functions.collect_set(df.key).alias('r')).collect()[0].r),
        [1, 2])
    self.assertEqual(
        sorted(df.select(functions.collect_list(df.key).alias('r')).collect()[0].r),
        [1, 1, 1, 2])
    self.assertEqual(
        sorted(df.select(functions.collect_set(df.value).alias('r')).collect()[0].r),
        ["1", "2"])
    self.assertEqual(
        sorted(df.select(functions.collect_list(df.value).alias('r')).collect()[0].r),
        ["1", "2", "2", "2"])
def query2(df, beg, end):
    to_full_name = udf(lambda x: states[x.upper()], StringType())
    return df.filter(col('time').between(beg, end)) \
        .filter(col('group_country') == 'us') \
        .withColumn('state', to_full_name('group_state')) \
        .groupBy(col('state').alias('state')) \
        .agg(collect_set('group_name'))
def read_guid():
    guids = spark.read.json('guids', multiLine=True).repartition(10)
    guids = guids.select('results.guid', 'results.genres')
    guids = guids.select('guid', functions.explode('genres').alias('genres'))
    guids = guids.select('guid', 'genres.name')
    guids = guids.groupBy('guid').agg(functions.collect_set('name'))
    guids.coalesce(1).write.json('game_genre', mode='overwrite')
def main(keyspace, output_directory, order_keys):
    df_orders = get_df_for_table("orders", keyspace)
    df_lineitem = get_df_for_table("lineitem", keyspace)
    df_part = get_df_for_table("part", keyspace)

    # Join the dataframes together
    df_joined = df_orders.join(df_lineitem, df_orders['orderkey'] == df_lineitem['orderkey'], 'inner') \
        .join(df_part, df_lineitem['partkey'] == df_part['partkey'], 'inner') \
        .select(df_orders['orderkey'], df_orders['totalprice'], df_part['name'])

    # Keep only the order keys passed as input
    df_filtered = df_joined.where(df_joined['orderkey'].isin(order_keys))

    # Order the data
    df_sorted = df_filtered.orderBy(df_filtered['orderkey'])

    # Group the data and collect the part names
    df_final = df_sorted.groupBy(df_sorted['orderkey'], df_sorted['totalprice']) \
        .agg(functions.collect_set(df_sorted['name']).alias('partnames'))

    # Explain the plan
    df_final.explain()

    # Convert to an RDD
    rdd_results = df_final.rdd

    # Apply the output format function to all rows
    rdd_output = rdd_results.map(output_line)

    # Write to the output directory
    rdd_output.coalesce(1).saveAsTextFile(output_directory)
def main(user_id, output, orderkeys):
    # Create a spark Dataframe object by reading the Cassandra table
    orders_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='orders', keyspace=user_id).load()
    # Keep only values which match the orderkeys given by the user
    orders_df = orders_df.filter(orders_df['orderkey'].isin(orderkeys))

    line_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='lineitem', keyspace=user_id).load()
    line_df = line_df.filter(line_df['orderkey'].isin(orderkeys))

    part_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='part', keyspace=user_id).load()

    # Conditions for joining tables
    condition1 = ['orderkey']
    tpch_df1 = orders_df.join(line_df, condition1, 'inner')
    condition2 = ['partkey']
    tpch_df2 = tpch_df1.join(part_df, condition2, 'inner')
    tpch_df2.show()

    # The joined table with ambiguous columns filtered out
    super_table = tpch_df2.filter(tpch_df2['orderkey'].isin(orderkeys))
    final_table = super_table.groupBy(
        super_table['orderkey'], super_table['totalprice']).agg(
            functions.collect_set(super_table['name']))
    final_table.explain()

    out = final_table.rdd.sortBy(lambda x: x[0]).map(output_line).coalesce(1)
    out.saveAsTextFile(output)
def evaluateTopk(model, data, top_k=500):
    '''
    Input:
        data: RDD - user, product (book_id), rating
    '''
    truth = spark.createDataFrame(data).groupby("user").agg(F.collect_set("product"))

    print("Getting Predictions...")
    tmp1 = model.recommendProductsForUsers(top_k).map(lambda r: [r[0], [k.product for k in r[1]]])
    predictions = spark.createDataFrame(tmp1, ["user", "predictions"])

    print("Predictions and Labels...")
    k = predictions.join(truth, truth.user == predictions.user)
    final = k.rdd.map(lambda r: [r[1], r[3]])
    metrics = RankingMetrics(final)

    print("\nCalculate NDCG at {}...".format(top_k))
    res1 = metrics.ndcgAt(top_k)
    print("NDCG at {}: {}".format(top_k, res1))

    print("\nCalculate MAP...")
    res2 = metrics.meanAveragePrecision
    print("MAP: {}".format(res2))

    print("\nCalculate Precision at {}...".format(top_k))
    res3 = metrics.precisionAt(top_k)
    print("Precision at {}: {}".format(top_k, res3))

    return res1, res2, res3
def runAggregateFunctions(spark, df1, df2):
    # collect_list, collect_set
    doubledDf1 = df1.union(df1)
    doubledDf1.select(functions.collect_list(doubledDf1["name"])).show(truncate=False)
    doubledDf1.select(functions.collect_set(doubledDf1["name"])).show(truncate=False)

    # count, countDistinct
    doubledDf1.select(functions.count(doubledDf1["name"]),
                      functions.countDistinct(doubledDf1["name"])).show(truncate=False)

    # sum
    df2.printSchema()
    df2.select(sum(df2["price"])).show(truncate=False)

    # grouping, grouping_id
    df2.cube(df2["store"], df2["product"]).agg(
        sum(df2["amount"]), grouping(df2["store"])).show(truncate=False)
    df2.cube(df2["store"], df2["product"]).agg(
        sum(df2["amount"]), grouping_id(df2["store"], df2["product"])).show(truncate=False)
d2 = d1.toDF("number", "name", "SI", "GOO", "DONG", "x", "y", "b_code", "h_code",
             "utmk_x", "utmk_y", "wtm_x", "wtm_y")
d3 = d2.select(d2.GOO.alias("loc"), d2.x, d2.y)
d3.show(5, False)

indexer = StringIndexer(inputCol="loc", outputCol="loccode")
assembler = VectorAssembler(inputCols=["loccode", "x", "y"], outputCol="features")
kmeans = KMeans(k=5, seed=1, featuresCol="features")
pipeline = Pipeline(stages=[indexer, assembler, kmeans])

model = pipeline.fit(d3)
d4 = model.transform(d3)

d4.groupBy("prediction") \
    .agg(functions.collect_set("loc").alias("loc")) \
    .orderBy("prediction").show(100, False)

WSSSE = model.stages[2].computeCost(d4)
print("Within Set Sum of Squared Errors = %d" % WSSSE)

print("Cluster Centers: ")
for v in model.stages[2].clusterCenters():
    print(v)

spark.stop()
from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
          covar_pop("InvoiceNo", "Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()


# COMMAND ----------

from pyspark.sql.functions import count, expr
df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"),
    expr("count(Quantity)")).show()


# COMMAND ----------

df.groupBy("InvoiceNo").agg(expr("avg(Quantity)"), expr("stddev_pop(Quantity)"))\
    .show()
df_joined1.count()

# Descriptive Stats
df_joined1.describe().show(10, False)

####################################################################################################################
#
# Model Prep
#
####################################################################################################################

order_list = df_order_products__train \
    .select(['order_id', 'product_id']) \
    .groupby("order_id") \
    .agg(collect_set("product_id")) \
    .withColumnRenamed('collect_set(product_id)', 'product_set')

order_list.show(20, False)

# (training, test) = df_joined1.randomSplit([0.8, 0.2])

####################################################################################################################
#
# Train Model
#
####################################################################################################################

fpGrowth = FPGrowth(itemsCol="product_set", minSupport=0.01, minConfidence=0.05)
model = fpGrowth.fit(order_list)
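# A possible continuation (not in the original snippet): once the FP-Growth model is
# fitted, the usual next steps are to inspect the frequent itemsets and association
# rules, and to generate per-order suggestions with transform().
model.freqItemsets.show(20, False)
model.associationRules.show(20, False)

# Adds a `prediction` column with items implied by the learned rules for each product_set.
model.transform(order_list).show(20, False)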