Example #1
def calcDSPD(dt, hr, country, stage, upper = dv.upper, lower = dv.lower, thres = dv.thres):
    SD = SpD(upper = upper, lower = lower, thres = thres)

    dspd_helper = DSPD(dt=dt,hr=hr, stage=stage, country=country, path=dv.input_path, output=dv.output_path)
    dat = dspd_helper.load()
    datn = dat.where("(too_freq_uid!=false or r_s_info is not null) and sl_adjusted_confidence in (94,95)")
    datn = datn.select(['request_id']+dspd_helper.keys)\
            .withColumn("derived_speed",F.lit(dumpJSON(-1.0,-1.0)))
    for coln in ['m','d','h']:
        datn = toStr(datn,coln)

    logging.info("Writing data without derived speed and angle for {}, {}, {}".\
            format(dspd_helper.dt,dspd_helper.hr,dspd_helper.cntry))
    dspd_helper.write(datn)
    logging.info("Done writing")

    datc = dat.where("too_freq_uid=false and r_s_info is null and sl_adjusted_confidence in (94,95)")\
            .withColumn("sec",F.round(dat["r_timestamp"]/(1000*float(lower)),0)*float(lower))\
            .groupby(['uid','sec'])\
            .agg(collect_set(struct(*(['request_id']+dspd_helper.keys))).alias('val_info'),\
                F.avg(F.round('latitude',5)).alias('lat'),F.avg(F.round('longitude',5)).alias('long'))\
            .groupby('uid')\
            .agg(collect_set(struct('sec','val_info','lat','long')).alias('comb'))
    datc = datc\
            .repartition(1000)\
            .rdd\
            .map(lambda x:SD.getSpeed(x))\
            .flatMap(lambda x:x)
#    schema = StructType(dat.schema.fields+[StructField("derived_speed",StringType(),True)])
    datc = spark.createDataFrame(datc,schema=datn.schema)
    logging.info("Writing data with derived speed and angle for {}, {}, {}".\
        format(dspd_helper.dt,dspd_helper.hr,dspd_helper.cntry))
    dspd_helper.write(datc)
    logging.info("Done writing")
Example #2
    def extract_data(self):
        """Method to extract data from the csv file."""

        works_data = self.data_path + '*'

        works_data_df = self.spark.read.load(works_data,
                                             format="csv",
                                             header="true")
        unicode_conversion = udf(lambda value: unicodedata.normalize(
            'NFKD', value).encode('ascii', 'ignore').decode())

        works_data_df = works_data_df.withColumn(
            'converted_title', unicode_conversion(col('title')))

        works_data_df = works_data_df.withColumn(
            'converted_contributors', unicode_conversion(col('contributors')))

        reconciled_data = works_data_df.select('*') \
                                            .groupBy('iswc') \
                                            .agg(concat_ws(', ', collect_set('converted_title')) \
                                            .alias('title'),
                                            concat_ws('|', collect_set('converted_contributors')) \
                                            .alias('contributors'),
                                            concat_ws(', ', collect_set('source')) \
                                            .alias('sources')) \
                                            .dropDuplicates() \
                                            .na.drop()

        return reconciled_data
Example #3
def n_gram_fingerprint_cluster(df, input_cols, n_size=2):
    """
    Cluster a DataFrame column based on the N-Gram Fingerprint algorithm
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :param n_size: size of the n-grams used to build the fingerprint
    :return: DataFrame with cluster size, cluster, count and recommended columns
    """
    input_cols = parse_columns(df, input_cols)
    for input_col in input_cols:
        ngram_fingerprint_col = name_col(input_col, NGRAM_FINGERPRINT_COL)

        # Prepare a group so we do not need to apply the fingerprint to the whole data set
        df = (
            df.select(input_col).groupBy(input_col).count().select(
                'count', input_col).repartition(
                    1)  # Needed for optimization in a single machine
            .cache())

        df = n_gram_fingerprint(df, input_col, n_size)

        count_col = name_col(input_col, COUNT_COL)
        cluster_col = name_col(input_col, CLUSTER_COL)
        recommended_col = name_col(input_col, RECOMMENDED_COL)
        cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)

        df = df.groupby(ngram_fingerprint_col).agg(
            F.collect_set(input_col).alias(cluster_col),
            F.sum("count").alias(count_col),
            F.first(input_col).alias(recommended_col),
            F.size(F.collect_set(input_col)).alias(cluster_size_col)).select(
                cluster_size_col, cluster_col, count_col, recommended_col)

        return df
Example #4
    def test_collect_functions(self):
        df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"),
                                         (1, "2")], ["key", "value"])
        from pyspark.sql import functions

        self.assertEqual(
            sorted(
                df.select(functions.collect_set(
                    df.key).alias("r")).collect()[0].r), [1, 2])
        self.assertEqual(
            sorted(
                df.select(functions.collect_list(
                    df.key).alias("r")).collect()[0].r),
            [1, 1, 1, 2],
        )
        self.assertEqual(
            sorted(
                df.select(functions.collect_set(
                    df.value).alias("r")).collect()[0].r), ["1", "2"])
        self.assertEqual(
            sorted(
                df.select(functions.collect_list(
                    df.value).alias("r")).collect()[0].r),
            ["1", "2", "2", "2"],
        )
Example #5
def main(spark):
    # The processing code.
    df = createDataFrame(spark)
    df.show(truncate=False)

    rightDf = df.withColumnRenamed("acct", "acct2") \
        .withColumnRenamed("bssn", "bssn2") \
        .withColumnRenamed("name", "name2") \
        .drop("tid")

    joinedDf = df.join(rightDf, df["acct"] == rightDf["acct2"], "leftsemi") \
        .drop(rightDf["acct2"]) \
        .drop(rightDf["name2"]) \
        .drop(rightDf["bssn2"])

    joinedDf.show(truncate=False)

    listDf = joinedDf.groupBy(F.col("acct")) \
        .agg(F.collect_list("bssn"), F.collect_list("name"))

    listDf.show(truncate=False)

    setDf = joinedDf.groupBy(F.col("acct")) \
        .agg(F.collect_set("bssn"), F.collect_set("name"))

    setDf.show(truncate=False)
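
The createDataFrame(spark) helper called at the top of main is not shown in this snippet. A hypothetical stand-in, with column names (tid, acct, bssn, name) inferred from the joins and renames that follow rather than taken from the original source, could look like this:

# Hypothetical stand-in for the createDataFrame helper referenced above;
# the column names and sample rows are inferred, not from the original code.
def createDataFrame(spark):
    rows = [(1, "A-100", 1111, "alice"),
            (2, "A-100", 2222, "bob"),
            (3, "A-200", 3333, "carol")]
    return spark.createDataFrame(rows, ["tid", "acct", "bssn", "name"])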
Example #6
def fingerprint_cluster(df, input_cols):
    """
    Cluster a dataframe column based on the Fingerprint algorithm
    :param df: Dataframe to be processed
    :param input_cols: Columns to be processed
    :return: DataFrame with cluster size, cluster, count and recommended columns
    """
    # df = self.df
    input_cols = parse_columns(df, input_cols)

    for input_col in input_cols:
        output_col = name_col(input_col, FINGERPRINT_COL)
        # Instead of applying the fingerprint to the whole data set, we group by names
        df = (
            df.groupBy(input_col).count().select(
                'count', input_col).repartition(
                    1)  # Needed for optimization in a single machine
            .cache())
        # Calculate the fingerprint
        df = fingerprint(df, input_col)

        count_col = name_col(input_col, COUNT_COL)
        cluster_col = name_col(input_col, CLUSTER_COL)
        recommended_col = name_col(input_col, RECOMMENDED_COL)
        cluster_size_col = name_col(input_col, CLUSTER_SIZE_COL)

        df = df.groupby(output_col).agg(
            F.collect_set(input_col).alias(cluster_col),
            F.sum("count").alias(count_col),
            F.first(input_col).alias(recommended_col),
            F.size(F.collect_set(input_col)).alias(cluster_size_col)).select(
                cluster_size_col, cluster_col, count_col, recommended_col)
    return df
Example #7
    def n_gram_fingerprint_cluster(df, columns, n_size=2):
        """
        Cluster a DataFrame column based on the N-Gram Fingerprint algorithm
        :param df: DataFrame to be processed
        :param columns: Columns to be processed
        :param n_size: size of the n-grams used to build the fingerprint
        :return: DataFrame with cluster size, cluster, count and recommended columns
        """
        columns = parse_columns(df, columns)
        for col_name in columns:
            n_gram_col = col_name + "_ngram_fingerprint"

            # Prepare a group so we don't need to apply the fingerprint to the whole data set
            df = (
                df.select(col_name).groupBy(col_name).count().select(
                    'count', col_name).repartition(
                        1)  # Needed for optimization in a single machine
                .cache())

            df = KeyCollision.n_gram_fingerprint(df, col_name, n_size)
            # df.table()
            df = df.groupby(n_gram_col).agg(
                F.collect_set(col_name).alias("cluster"),
                F.sum("count").alias("count"),
                F.first(col_name).alias("recommended"),
                F.size(F.collect_set(col_name)).alias("cluster_size")).select(
                    "cluster_size", "cluster", "count", "recommended")

            return df
Example #8
def resolve_image_record_parameter_association(
        image_record_observation_df: DataFrame,
        simple_observations_df: DataFrame):
    simple_df = simple_observations_df.alias("simple")
    image_df = image_record_observation_df.alias("image").withColumn(
        "parameterAsc",
        explode("image.seriesMediaParameterValue.parameterAssociation"))
    image_vs_simple_parameters_df = image_df.join(
        simple_df,
        (col("simple.experiment_id") == col("image.experiment_id"))
        & (col("simple.parameter_stable_id")
           == col("parameterAsc._parameterID")),
    )
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumn(
        "paramName", col("simple.parameter_name"))
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumn(
        "paramSeq", lit("0"))
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumn(
        "paramValue",
        when(col("data_point").isNotNull(), col("data_point")).otherwise(
            when(col("category").isNotNull(),
                 col("category")).otherwise(col("text_value"))),
    )
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.groupBy(
        col("image.observation_id"), col("image.parameter_stable_id")).agg(
            collect_set("parameterAsc._parameterID").alias("paramIDs"),
            collect_set("paramName").alias("paramNames"),
            collect_set("paramSeq").alias("paramSeqs"),
            collect_set("paramValue").alias("paramValues"),
        )
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.withColumnRenamed(
        "observation_id",
        "img_observation_id").withColumnRenamed("parameter_stable_id",
                                                "img_parameter_stable_id")
    image_vs_simple_parameters_df = image_vs_simple_parameters_df.select(
        "img_observation_id",
        "img_parameter_stable_id",
        "paramIDs",
        "paramNames",
        "paramSeqs",
        "paramValues",
    )
    image_record_observation_df = image_record_observation_df.join(
        image_vs_simple_parameters_df,
        (image_record_observation_df["observation_id"]
         == image_vs_simple_parameters_df["img_observation_id"])
        & (image_record_observation_df["parameter_stable_id"]
           == image_vs_simple_parameters_df["img_parameter_stable_id"]),
        "left_outer",
    )
    image_record_observation_df = (
        image_record_observation_df.withColumnRenamed(
            "paramIDs", "parameter_association_stable_id").withColumnRenamed(
                "paramNames", "parameter_association_name").withColumnRenamed(
                    "paramSeqs",
                    "parameter_association_sequence_id").withColumnRenamed(
                        "paramValues", "parameter_association_value"))
    return image_record_observation_df
Example #9
def scd_analyze(df, merge_on=None, state_col='_state', updated_col='_updated'):
    add_ids = '##add_ids'
    del_ids = '##del_ids'
    upd_ids = '##upd_ids'

    c = set(df.columns).difference({state_col, updated_col})
    colnames = [x for x in df.columns if x in c]

    on = merge_on or colnames
    on = on if isinstance(on, (list, tuple)) else [on]
    on = [c for c in on if c in colnames]

    s = on + [state_col, updated_col]
    cols = [x for x in df.columns if x not in s]

    a = df.filter(f'{state_col} = 0') \
        .groupby(updated_col) \
        .agg(F.collect_set(F.concat(*on)).alias(add_ids)) \
        .select(updated_col, add_ids)

    d = df.filter(f'{state_col} = 1') \
        .groupby(updated_col) \
        .agg(F.collect_set(F.concat(*on)).alias(del_ids)) \
        .select(updated_col, del_ids)

    res = a.join(d, on=updated_col, how='outer')
    res = res.select(updated_col,
                     F.coalesce(add_ids, F.array([])).alias(add_ids),
                     F.coalesce(del_ids, F.array([])).alias(del_ids))

    if cols:
        agg_funcs = [(F.countDistinct(x) - F.lit(1)).alias(x) for x in cols]
        cnt = df.groupby(*on, updated_col).agg(*agg_funcs)

        agg_names = [F.lit(x) for x in cols]
        agg_sums = [F.sum(x) for x in cols]
        cnt = cnt.groupby(updated_col).agg(
            F.map_from_arrays(F.array(*agg_names),
                              F.array(*agg_sums)).alias('changes'))

        res = res.join(cnt, on=updated_col)
    else:
        res = res.withColumn('changes', F.lit(None))

    res = res.select('*', F.array_intersect(add_ids, del_ids).alias(upd_ids))
    res = res.select(
        F.col(updated_col).alias('updated'),
        F.size(upd_ids).alias('upd'),
        F.size(F.array_except(add_ids, upd_ids)).alias('add'),
        F.size(F.array_except(del_ids, upd_ids)).alias('del'), 'changes')

    return res.orderBy('updated')
Example #10
def generate_TFIDF(sc, df, sqlcontext):

    # 1. Count the number of rows (documents) in the data frame
    t_num = df.count()

    # 2. Select _id, lower-case the text_entry, strip punctuation symbols,
    # and split it into a list of words ('tokens')
    word_splits = df.select("_id", F.split(F.lower(F.regexp_replace(df.text_entry, r'[^\w\s]', '')), ' ').alias('tokens'))

    # 3. Explode the list of words into (_id, token) rows, then group by _id and token
    # to compute the term frequency (tf) of each token per document,
    # producing a data frame words_tf (_id, token, tf)
    words_tf = word_splits.select("_id", F.explode(word_splits.tokens).alias('token')) \
        .groupBy("_id", "token").agg({'token': 'count'}).withColumnRenamed("count(token)", "tf")

    # 4. To compute the document frequency (df) of each token, group by token,
    # collect the set of distinct _ids ('collect_set' eliminates duplicates),
    # and take the size of that set,
    # producing a data frame words_df (_id, token, df)
    words_df = words_tf.groupby("token").agg(F.collect_set("_id").alias("_ids")) \
        .select("token", F.explode("_ids").alias('_id'), F.size("_ids").alias('df'))

    # 5. To build the final TF-IDF data frame, join words_tf and words_df
    # on matching _id and token, compute idf as log10(t_num / df),
    # and tf_idf as tf * idf
    tokensWithTfIdf = words_tf.join(words_df, (words_tf._id == words_df._id) & (words_tf.token == words_df.token)) \
        .select(words_tf._id, words_tf.token, words_tf.tf, words_df.df,
                (F.log10(t_num / words_df.df)).alias("idf"),
                (F.log10(t_num / words_df.df) * words_tf.tf).alias("tf_idf"))

    # 6. Cache the TF-IDF data frame for further use
    tokensWithTfIdf.cache()
    return tokensWithTfIdf
Example #11
def to_user_reviewed_products(
        reviews_dataframe: DataFrame,
        cross_bin_col: str = "cross_bin_number") -> DataFrame:
    """
    TODO: remember to change the cross_bin_col name
    Return users' positive reviews grouped by user, with item indices.

    Args:
        reviews_dataframe (DataFrame):
        +-----------+----------+----------------+----------------+-----------------+
        |customer_id|product_id|product_id_index|cross_bin_number|customer_id_index|
        +-----------+----------+----------------+----------------+-----------------+
        |   10686361|B000GDBOPQ|               0|               4|              143|
        |   22517088|B000GF33I0|               1|               6|             2561|
        |   14770984|B000GFD4C0|               2|               5|             1174|
        |   40268049|B000GFD4C0|               3|               6|             4776|
        |   44060334|B000GFD4C0|               4|               4|             5342|
        +-----------+----------+----------------+----------------+-----------------+
    Returns:
        DataFrame:
        root
        |-- customer_id: string (nullable = true)
        |-- customer_id_index: integer (nullable = true)
        |-- cross_bin_number: integer (nullable = true)
        |-- positives_ids: array (nullable = true)
        |    |-- element: integer (containsNull = false)

    """

    return reviews_dataframe.groupby([
        "customer_id", "customer_id_index", cross_bin_col
    ]).agg(F.collect_set("product_id_index").alias("positives_ids"), )
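
A minimal usage sketch of to_user_reviewed_products, assuming a SparkSession named spark and that the module-level imports the function relies on (DataFrame, F) are in place; the sample rows are taken from the docstring's table:

# Build a small reviews DataFrame matching the docstring schema, then group it.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
reviews = spark.createDataFrame(
    [("10686361", "B000GDBOPQ", 0, 4, 143),
     ("22517088", "B000GF33I0", 1, 6, 2561),
     ("14770984", "B000GFD4C0", 2, 5, 1174)],
    ["customer_id", "product_id", "product_id_index",
     "cross_bin_number", "customer_id_index"])

to_user_reviewed_products(reviews).show(truncate=False)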
Example #12
def create_intervals_to_keep(df, window):
    ''' Creates merged intervals from the significant positions
    '''

    # Create interval column
    intervals = (df.withColumn(
        'interval', array(F.col('pos') - window,
                          F.col('pos') + window)).drop('pos'))

    interval_reducer_fn = udf(lambda key: interval_reducer(key),
                              ArrayType(ArrayType(IntegerType())))
    # Merge intervals
    m_intervals = (intervals.groupby(
        'study_id', 'phenotype_id', 'bio_feature',
        'chrom').agg(F.collect_set('interval').alias('intervals')).withColumn(
            'intervals', interval_reducer_fn('intervals')).withColumn(
                'interval', F.explode('intervals')))

    merged_intervals = (m_intervals.withColumn(
        'start', m_intervals['interval'][0]).withColumn(
            'end', m_intervals['interval'][1]).withColumn(
                'start',
                when(F.col('start') > 0, F.col('start')).otherwise(0)).drop(
                    'interval', 'intervals'))

    # merged_intervals.show()

    return merged_intervals
Example #13
def algorithm1(i, g):
    while (True):
        aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                         sendToDst=F.when(
                                             AM.src['value'] == -1,
                                             AM.src["id"]))

        new_vertices = g.vertices.join(
            aggregates, on="id", how="left_outer").withColumn(
                "newValue",
                getid_maximum_udf2("id", "agg", lit(i),
                                   "value")).drop("agg").withColumn(
                                       'max_by_rows',
                                       greatest('value', 'newValue')).drop(
                                           "value",
                                           "newValue").withColumnRenamed(
                                               "max_by_rows", "value")
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()
        g.vertices.createOrReplaceTempView("temp_table")
        if (spark.sql("SELECT * from temp_table where value = -1").count() == 0
            ):
            final_df = g.vertices
            break
    return final_df
Example #14
def psm_table(psm, pep, out_path):
    if not os.path.isdir(out_path):
        print('The output_path specified does not exist: ' + out_path)
        sys.exit(1)

    sql_context = SparkSession.builder.getOrCreate()
    df_psm = sql_context.read.parquet(psm)
    df_pep = sql_context.read.parquet(pep)

    df_pep_exploded = df_pep.select(
        Fields.PROTEIN_ACCESSION,
        explode(Fields.PSM_SPECTRUM_ACCESSIONS).alias("psm"))
    df_pep_select = df_pep_exploded.groupby('psm.usi').agg(
        functions.collect_set(Fields.PROTEIN_ACCESSION)).toDF(
            Fields.USI, Fields.PROTEIN_ACCESSION)

    df_psm_exploded = df_psm.select(
        Fields.USI,
        explode(Fields.ADDITIONAL_ATTRIBUTES).alias(
            Fields.ADDITIONAL_ATTRIBUTES), Fields.ASSAY_ACCESSION,
        Fields.PEPTIDE_SEQUENCE, Fields.MODIFIED_PEPTIDE_SEQUENCE,
        Fields.CHARGE, Fields.PRECURSOR_MASS, Fields.IS_DECOY)
    df_psm_filtered = df_psm_exploded.filter(
        "additionalAttributes.accession == 'MS:1002355'")

    df_join = df_psm_filtered.join(df_pep_select, df_psm_filtered.usi == df_pep_select.usi, how='left') \
      .select(df_psm_filtered.usi, Fields.ASSAY_ACCESSION, Fields.PEPTIDE_SEQUENCE, Fields.MODIFIED_PEPTIDE_SEQUENCE,
              Fields.PROTEIN_ACCESSION, 'additionalAttributes.name', 'additionalAttributes.value', Fields.CHARGE,
              Fields.PRECURSOR_MASS, Fields.IS_DECOY) \
      .toDF(psmtable.USI, psmtable.PX_PROJECT_ACCESSION, psmtable.PEPTIDE, psmtable.MODIFIED_PEPTIDE, psmtable.PROTEINS,
            psmtable.ID_SCORE_NAME, psmtable.ID_SCORE_VALUE, psmtable.CHARGE, psmtable.MASS, psmtable.IS_DECOY)
    # df_join.show(truncate=False)
    df_join.write.parquet(out_path, mode='append', compression='snappy')
Example #15
def algorithm2(i, g):
    while (True):
        aggregates = g.aggregateMessages(F.collect_set(AM.msg).alias("agg"),
                                         sendToDst=F.when(
                                             AM.src['value'] == -1,
                                             AM.src["id"]))

        new_vertices = g.vertices.join(
            aggregates, on="id", how="left_outer").withColumn(
                "newValue",
                getid_maximum_udf2("id", "agg", lit(i),
                                   "value")).drop("agg").withColumn(
                                       'max_by_rows',
                                       greatest('value', 'newValue')).drop(
                                           "value",
                                           "newValue").withColumnRenamed(
                                               "max_by_rows", "value")
        cached_new_vertices = AM.getCachedDataFrame(new_vertices)
        g = GraphFrame(cached_new_vertices, g.edges)
        i += 1
        g.vertices.show()
        if (g.filterVertices(
                "value == -1").dropIsolatedVertices().edges.count() == 0):
            final_df = g.vertices
            final_df = final_df.withColumn(
                "value",
                F.when(final_df["value"] == -1,
                       i).otherwise(final_df["value"]))
            break
    return final_df
Example #16
def main(input_dir,output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('title', types.StringType()),
        types.StructField('created_utc_iso', types.DateType()),
        types.StructField('polarity_subjectivity', types.ArrayType(types.FloatType()))
    ])

    headlines_df = spark.read.json(input_dir,encoding='utf-8',schema=df_schema).repartition(80)
    split_sentiment_df = headlines_df.withColumn(
        'polarity', functions.element_at(headlines_df['polarity_subjectivity'],1)
    ).withColumn(
        'subjectivity', functions.element_at(headlines_df['polarity_subjectivity'],2)
    ).cache()

    for year_int in range(2008,2020):
        print('Plotting for '+str(year_int))
        headlines_year = split_sentiment_df.where(
            functions.year(split_sentiment_df['created_utc_iso']) == year_int
        ).withColumn('year',functions.year(split_sentiment_df['created_utc_iso']))

        headlines_grouped = headlines_year.groupBy(headlines_year['year']).agg(
            functions.collect_set(headlines_year['title_clean']).alias('titles_group')
        )
        headlines_joined = headlines_grouped.select( functions.array_join(headlines_grouped['titles_group'],' ').alias('joined') )
        string_to_plot = headlines_joined.collect()[0]['joined'] #only one row remaining of concatenated headlines

        wordcloud = WordCloud(background_color='white', stopwords=stopwords, width=1000, height=500).generate(string_to_plot)
        wordcloud.to_file(output_dir + '/'+str(year_int)+'_words.png')
Example #17
def runAggregateFunctions(spark, df1, df2):
    # collect_list, collect_set
    doubledDf1 = df1.union(df1)
    doubledDf1.select(functions.collect_list(
        doubledDf1["name"])).show(truncate=False)
    doubledDf1.select(functions.collect_set(
        doubledDf1["name"])).show(truncate=False)

    # count, countDistinct
    doubledDf1.select(functions.count(doubledDf1["name"]),
                      functions.countDistinct(
                          doubledDf1["name"])).show(truncate=False)

    # sum
    df2.printSchema()
    df2.select(sum(df2["price"])).show(truncate=False)

    # grouping, grouping_id
    df2.cube(df2["store"],
             df2["product"]).agg(sum(df2["amount"]),
                                 grouping(df2["store"])).show(truncate=False)
    df2.cube(df2["store"], df2["product"]).agg(
        sum(df2["amount"]), grouping_id(df2["store"],
                                        df2["product"])).show(truncate=False)

    # sort using grouping_id
    df2.cube(df2["store"], df2["product"]) \
        .agg(sum("amount").alias("sum"), grouping_id("store", "product").alias("gid")) \
        .filter("gid != '2'") \
        .sort(asc("store"), col("gid")) \
        .na.fill({"store":"Total", "product":"-"}) \
        .select("store", "product", "sum") \
        .show(truncate=False)
Example #18
    def feature_imp_pyspark(self):
        num_var = [i[0] for i in self.data_frame.dtypes if ((i[1]=='int') | (i[1]=='double')) & (i[0]!=self.target)]
        num_var = [col for col in num_var if not col.endswith('indexed')]
        # labels_count = [len(self.data_frame.select(col).distinct().collect()) for col in num_var]
        labels_count = [len(self.data_frame.agg((F.collect_set(col).alias(col))).first().asDict()[col]) for col in num_var]
        labels_count.sort()
        max_count =  labels_count[-1]
        #one_hot = [col for col in self.data_frame.columns if col.endswith('_indexed_encoded')]
        #num_var.extend(one_hot)
        label_indexes = StringIndexer(inputCol = self.target , outputCol = 'label', handleInvalid = 'keep')
        assembler = VectorAssembler(inputCols = num_var , outputCol = "features")
        if self.problem_type == 'REGRESSION':
            model = RandomForestRegressor(labelCol="label", \
                                     featuresCol="features", seed = 8464,\
                                     numTrees=10, cacheNodeIds = True,\
                                     subsamplingRate = 0.7)
        else:
            model = RandomForestClassifier(labelCol="label", \
                                     featuresCol="features", seed = 8464,\
                                     numTrees=10, cacheNodeIds = True,\
                                     subsamplingRate = 0.7,maxBins = max_count+2)
        pipe = Pipeline(stages =[assembler, label_indexes, model])

        mod_fit = pipe.fit(self.data_frame)
        df2 = mod_fit.transform(self.data_frame)
        cols = MLUtils.ExtractFeatureImp(mod_fit.stages[-1].featureImportances, df2, "features")
        cols_considered = cols.loc[cols['score'] > 0]
        cols_considered = list(cols_considered['name'])
        #tree_fs = list(set(cols_considered) & set(self.data_frame.columns))
        #tree_fs.extend(list(set([encoded for encoded in one_hot for column in cols_considered if column.startswith(encoded)])))
        self.data_change_dict['SelectedColsTree'] = cols_considered
        if self.target not in cols_considered:
            cols_considered.append(self.target)
        return cols_considered
Example #19
    def levels(self) -> list:
        """
        Names of index columns in list.

        .. note:: Be aware of the possibility of running into out
            of memory issue if returned list is huge.

        Examples
        --------
        >>> mi = pd.MultiIndex.from_arrays((list('abc'), list('def')))
        >>> mi.names = ['level_1', 'level_2']
        >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=mi)
        >>> kdf.index.levels
        [['a', 'b', 'c'], ['d', 'e', 'f']]

        >>> mi = pd.MultiIndex.from_arrays((list('bac'), list('fee')))
        >>> mi.names = ['level_1', 'level_2']
        >>> kdf = ks.DataFrame({'a': [1, 2, 3]}, index=mi)
        >>> kdf.index.levels
        [['a', 'b', 'c'], ['e', 'f']]
        """
        scols = self._kdf._internal.index_scols
        row = self._kdf._sdf.select([F.collect_set(scol)
                                     for scol in scols]).first()

        # sorting is used because pandas does not preserve the appearance order of
        # level values, e.g. ['b', 'd', 'a'] is returned as ['a', 'b', 'd']
        return [sorted(col) for col in row]
Example #20
def main():

    spark = SparkSession \
        .builder \
        .getOrCreate()

    spark.sparkContext.setCheckpointDir('gs://reddit_data_soen498/checkpoint/')
    
    @udf("boolean")
    def isNotDefault(x):
        defaultSubs = ["Art", "AskReddit", "DIY", "Documentaries", "EarthPorn", "Futurology", "GetMotivated", "IAmA", "InternetIsBeautiful", "Jokes", "LifeProTips", "Music", "OldSchoolCool", "Showerthoughts", "UpliftingNews", "announcements", "askscience", "aww", "blog", "books", "creepy", "dataisbeautiful", "explainlikeimfive", "food", "funny", "gadgets", "gaming", "gifs", "history", "listentothis", "mildlyinteresting", "movies", "news", "nosleep", "nottheonion", "personalfinance", "philosophy", "photoshopbattles", "pics", "science", "space", "sports", "television", "tifu", "todayilearned", "videos", "worldnews"]
        return x not in defaultSubs
    
    data = spark.read.json("gs://reddit_data_soen498/RC_2018-02.json")
    keep = [data.author, data.id, data.subreddit]
    data = data.select(*keep)
    data = data.filter(data.author != "[deleted]")
    data = data.filter(isNotDefault(data.subreddit))

    data = data.groupBy(data.author).agg(F.collect_set("subreddit").alias("items"))
    size_ = udf(lambda xs: len(xs), IntegerType())
    data = data.filter(size_(data.items) > 1)
    data = data.select(data.items)
    support = 200/data.count()
    fp = FPGrowth(minSupport=support, minConfidence=0.5)
    fpm = fp.fit(data)
    fpm.associationRules.show(100)
    
    fpm.save("gs://reddit_data_soen498/modelFP_noDefaultSub_20support")
Example #21
def main(keyspace, outdir, orderkeys):

    # main logic starts here

    order_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='orders', keyspace=keyspace).load()
    order_df.createOrReplaceTempView('orders')

    part_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='part', keyspace=keyspace).load()
    part_df.createOrReplaceTempView('part')

    lineitem_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='lineitem', keyspace=keyspace).load()
    lineitem_df.createOrReplaceTempView('lineitem')

    summary_table = spark.sql(('''
    SELECT o.orderkey, o.totalprice, p.name
    FROM orders o JOIN lineitem l ON o.orderkey = l.orderkey JOIN part p ON p.partkey = l.partkey
    WHERE o.orderkey IN %s
    ORDER BY o.orderkey, p.name
    ''' % orderkeys).replace('[', '(').replace(']', ')'))

    group_table = summary_table.groupBy('orderkey', 'totalprice').agg(
        functions.collect_set('name'))

    group_table = group_table.orderBy(group_table.orderkey)
    group_table.explain()

    order_rdd = group_table.rdd.map(output_line)
    order_rdd.saveAsTextFile(outdir)
Example #22
    def nunique(self, df):
        """Calculates the number of unique values in a column over a window."""
        w = self.get_window(self.partition_by, self.order_by,
                            self.window_length)
        return df.withColumn(
            self.column_alias,
            psf.size(psf.collect_set(self.aggregation_column).over(w)))
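
The nunique helper above relies on class attributes (partition_by, order_by, window_length, column_alias, aggregation_column, get_window) that are not shown here. A minimal standalone sketch of the same idea, a distinct count over a window via collect_set plus size, using hypothetical column names:

from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.createDataFrame(
    [("a", 1), ("a", 2), ("a", 2), ("b", 3)], ["grp", "val"])

# Count distinct `val` values within each `grp` partition.
w = Window.partitionBy("grp")
df.withColumn("nunique", F.size(F.collect_set("val").over(w))).show()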
Example #23
def main(keyspace, outdir, orderkeys):
    # Create Orders view
    orders_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='orders', keyspace=keyspace).load()
    orders_df.createOrReplaceTempView('orders')

    # Create Parts view
    part_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='part', keyspace=keyspace).load()
    part_df.createOrReplaceTempView('part')

    # Create LineItems view
    line_item_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='lineitem', keyspace=keyspace).load()
    line_item_df.createOrReplaceTempView('lineitem')

    # Join the tables with SQL query
    join_table = spark.sql('''
	                       select o.orderkey,o.totalprice, p.name from Orders o 
	                       join lineitem l on o.orderkey = l.orderkey
	                       join part p ON l.partkey = p.partkey
	                       where o.orderkey in
	                       ''' + str(orderkeys))

    # Make parts from same orders into single row and parts as comma separated
    formatted_summary = join_table.groupBy('orderkey', 'totalprice').\
        agg(functions.collect_set('name')).alias('names')
    formatted_summary = formatted_summary.orderBy(formatted_summary.orderkey)
    formatted_summary.show()
    lines = formatted_summary.rdd
    lines = lines.map(output_line)
    lines.coalesce(1).saveAsTextFile(outdir)
Example #24
def evaluation(df, model, ks):
	'''
	Evaluate the model.
	ks: a list of parameter k used in precision at k and NDCG at k.
	'''

	print(' Make predictions...')
	predictions = model.recommendForUserSubset(df, 500)

	print(' Prepare ground truth set and predicted set...')
	labels = df.groupBy('user').agg(F.collect_set('item')).collect()
	user_pred = predictions.select('user','recommendations.item').rdd.flatMap(lambda x:[x]).collect()
	labels = sorted(labels, key = lambda x: x.user)
	user_pred = sorted(user_pred, key = lambda x: x.user)
	print(' Combine ground truth set and predicted set...')
	predictionAndLabels = []
	for i in range(len(user_pred)):
		predictionAndLabels.append((user_pred[i].item, labels[i][1]))
	print(' Parallelize...')
	predictionAndLabels = sc.parallelize(predictionAndLabels, numSlices=2000)
	print(' Calculate metrics...')
	metrics = RankingMetrics(predictionAndLabels)
	eval_results = []
	eval_results.append(metrics.meanAveragePrecision)
	for k in ks:
		eval_results.append(metrics.precisionAt(k))
		eval_results.append(metrics.ndcgAt(k))

	return eval_results
Example #25
def main(key_space, outdir, orderkeys):

    Where_condition = tuple([int(x) for x in orderkeys])

    orders_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='orders', keyspace=key_space).load()
    orders_df.createOrReplaceTempView('orders')

    line_item_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='lineitem', keyspace=key_space).load()
    line_item_df.createOrReplaceTempView('lineitem')

    parts_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='part', keyspace=key_space).load()
    parts_df.createOrReplaceTempView('part')

    query = """SELECT o.*  ,p.name FROM orders o JOIN lineitem l ON (o.orderkey = l.orderkey) JOIN part p ON (p.partkey = l.partkey) WHERE o.orderkey IN {}""".format(
        Where_condition)
    join_df = spark.sql(query)
    join_df = join_df.groupBy(join_df['orderkey'], join_df['totalprice']).agg(
        f.collect_set(join_df['name']))
    join_df.explain()

    join_rdd = join_df.rdd
    #join_rdd.take(10)
    join_rdd = join_rdd.map(output_line)
    #join_rdd.take(10)
    join_rdd.saveAsTextFile(outdir)
Example #26
    def test_collect_functions(self):
        df = self.spark.createDataFrame([(1, "1"), (2, "2"), (1, "2"), (1, "2")], ["key", "value"])
        from pyspark.sql import functions

        self.assertEqual(
            sorted(df.select(functions.collect_set(df.key).alias('r')).collect()[0].r),
            [1, 2])
        self.assertEqual(
            sorted(df.select(functions.collect_list(df.key).alias('r')).collect()[0].r),
            [1, 1, 1, 2])
        self.assertEqual(
            sorted(df.select(functions.collect_set(df.value).alias('r')).collect()[0].r),
            ["1", "2"])
        self.assertEqual(
            sorted(df.select(functions.collect_list(df.value).alias('r')).collect()[0].r),
            ["1", "2", "2", "2"])
Example #27
def query2(df, beg, end):

    to_full_name = udf(lambda x: states[x.upper()], StringType())

    return df.filter(col('time').between(beg, end)).filter(col('group_country')=='us')\
             .withColumn('state', to_full_name('group_state')).groupBy(col('state').alias('state'))\
             .agg(collect_set('group_name'))
Example #28
def read_guid():
    guids = spark.read.json('guids', multiLine=True).repartition(10)
    guids = guids.select('results.guid', 'results.genres')
    guids = guids.select('guid', functions.explode('genres').alias('genres'))
    guids = guids.select('guid', 'genres.name')
    guids = guids.groupBy('guid').agg(functions.collect_set('name'))
    guids.coalesce(1).write.json('game_genre', mode='overwrite')
Example #29
def main(keyspace, output_directory, order_keys):
    df_orders = get_df_for_table("orders", keyspace)
    df_lineitem = get_df_for_table("lineitem", keyspace)
    df_part = get_df_for_table("part", keyspace)

    # join dataframes together
    df_joined = df_orders.join(df_lineitem, df_orders['orderkey'] == df_lineitem['orderkey'], 'inner') \
                         .join(df_part, df_lineitem['partkey'] == df_part['partkey'], 'inner') \
                         .select(df_orders['orderkey'], df_orders['totalprice'], df_part['name'])

    # get order keys sent via input
    df_filtered = df_joined.where(df_joined['orderkey'].isin(order_keys))

    # order data
    df_sorted = df_filtered.orderBy(df_filtered['orderkey'])

    # group data and collect parts
    df_final = df_sorted.groupBy(df_sorted['orderkey'], df_sorted['totalprice']) \
                        .agg(functions.collect_set(df_sorted['name']).alias('partnames'))

    # explain plan
    df_final.explain()

    # convert to rdd
    rdd_results = df_final.rdd

    # apply output format function to all rows
    rdd_outpt = rdd_results.map(output_line)

    # write to output directory
    rdd_outpt.coalesce(1).saveAsTextFile(output_directory)
Example #30
def main(user_id, output, orderkeys):
    #Create a spark Dataframe object by reading the Cassandra table
    orders_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='orders', keyspace=user_id).load()
    #Keep only values which match the orderkeys given by the user
    orders_df = orders_df.filter(orders_df['orderkey'].isin(orderkeys))

    line_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='lineitem', keyspace=user_id).load()
    line_df = line_df.filter(line_df['orderkey'].isin(orderkeys))

    part_df = spark.read.format("org.apache.spark.sql.cassandra").options(
        table='part', keyspace=user_id).load()
    #Conditions for joining tables
    condition1 = ['orderkey']
    tpch_df1 = orders_df.join(line_df, condition1, 'inner')
    condition2 = ['partkey']
    tpch_df2 = tpch_df1.join(part_df, condition2, 'inner')
    tpch_df2.show()

    #The joined table with ambiguous columns filtered out
    super_table = tpch_df2.filter(tpch_df2['orderkey'].isin(orderkeys))
    final_table = super_table.groupBy(
        super_table['orderkey'], super_table['totalprice']).agg(
            functions.collect_set(super_table['name']))
    final_table.explain()
    out = final_table.rdd.sortBy(lambda x: x[0]).map(output_line).coalesce(1)
    out.saveAsTextFile(output)
Example #31
def evaluateTopk(model,data,top_k=500):
    '''
    Input:
    validation: RDD
        - user, product (book_id), rating
    '''
    truth=spark.createDataFrame(data).groupby("user").agg(F.collect_set("product"))
    print("Getting Predictions...")
    tmp1=model.recommendProductsForUsers(top_k).map(lambda r: [r[0],[k.product for k in r[1]]])
    predictions=spark.createDataFrame(tmp1,["user","predictions"])


    print("Predictions and Labels...")
    k=predictions.join(truth,truth.user==predictions.user)
    final=k.rdd.map(lambda r: [r[1],r[3]])
    metrics=RankingMetrics(final)

    print("\nCalculate NDCG at {}...".format(top_k))
    res1=metrics.ndcgAt(top_k)
    print("NDCG at {}: {}".format(top_k,res1))

    print("\nCalculate MAP...")
    res2=metrics.meanAveragePrecision
    print("MAP: {}".format(res2))

    print("\nCalculate Precision at {}...".format(top_k))
    res3=metrics.precisionAt(top_k)
    print("Precision at {}: {}".format(top_k,res1))

    return res1,res2,res3
Example #32
def runAggregateFunctions(spark, df1, df2):
    # collect_list, collect_set
    doubledDf1 = df1.union(df1)
    doubledDf1.select(functions.collect_list(doubledDf1["name"])).show(truncate=False)
    doubledDf1.select(functions.collect_set(doubledDf1["name"])).show(truncate=False)

    # count, countDistinct
    doubledDf1.select(functions.count(doubledDf1["name"]), functions.countDistinct(doubledDf1["name"])).show(
        truncate=False)

    # sum
    df2.printSchema()
    df2.select(sum(df2["price"])).show(truncate=False)

    # grouping, grouping_id
    df2.cube(df2["store"], df2["product"]).agg(sum(df2["amount"]), grouping(df2["store"])).show(truncate=False)
    df2.cube(df2["store"], df2["product"]).agg(sum(df2["amount"]), grouping_id(df2["store"], df2["product"])).show(
        truncate=False)
Example #33
d2 = d1.toDF("number", "name", "SI", "GOO", "DONG", "x", "y", "b_code", "h_code", "utmk_x", "utmk_y", "wtm_x", "wtm_y")

d3 = d2.select(d2.GOO.alias("loc"), d2.x, d2.y)

d3.show(5, False)

indexer = StringIndexer(inputCol="loc", outputCol="loccode")

assembler = VectorAssembler(inputCols=["loccode", "x", "y"], outputCol="features")

kmeans = KMeans(k=5, seed=1, featuresCol="features")

pipeline = Pipeline(stages=[indexer, assembler, kmeans])

model = pipeline.fit(d3)

d4 = model.transform(d3)

d4.groupBy("prediction") \
    .agg(functions.collect_set("loc").alias("loc")) \
    .orderBy("prediction").show(100, False)

WSSSE = model.stages[2].computeCost(d4)
print("Within Set Sum of Squared Errors = %d" % WSSSE)

print("Cluster Centers: ")
for v in model.stages[2].clusterCenters():
    print(v)

spark.stop()
Example #34
from pyspark.sql.functions import skewness, kurtosis
df.select(skewness("Quantity"), kurtosis("Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import corr, covar_pop, covar_samp
df.select(corr("InvoiceNo", "Quantity"), covar_samp("InvoiceNo", "Quantity"),
    covar_pop("InvoiceNo", "Quantity")).show()


# COMMAND ----------

from pyspark.sql.functions import collect_set, collect_list
df.agg(collect_set("Country"), collect_list("Country")).show()


# COMMAND ----------

from pyspark.sql.functions import count, expr

df.groupBy("InvoiceNo").agg(
    count("Quantity").alias("quan"),
    expr("count(Quantity)")).show()


# COMMAND ----------

df.groupBy("InvoiceNo").agg(expr("avg(Quantity)"),expr("stddev_pop(Quantity)"))\
  .show()
Example #35
df_joined1.count()

# Descriptive Stats
df_joined1.describe().show(10,False)


####################################################################################################################
#
#   Model Prep
#
####################################################################################################################

order_list = df_order_products__train \
    .select(['order_id','product_id']) \
    .groupby("order_id") \
    .agg(collect_set("product_id")) \
    .withColumnRenamed('collect_set(product_id)','product_set')

order_list.show(20,False)


#(training, test) = df_joined1.randomSplit([0.8, 0.2])

####################################################################################################################
#
#   Train Model
#
####################################################################################################################

fpGrowth = FPGrowth(itemsCol="product_set", minSupport=0.01, minConfidence=0.05)
model    = fpGrowth.fit(order_list)
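
Assuming the FP-Growth fit above succeeds, a possible follow-up (not part of the original snippet) is to inspect the mined itemsets and rules and to score the baskets:

# Inspect frequent itemsets and association rules, then attach predictions
# (consequents of matching rules) to each order's product_set.
model.freqItemsets.show(20, False)
model.associationRules.show(20, False)
model.transform(order_list).show(20, False)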