Example #1
    def prepare_df(df):
        num_rows = df.count()

        # Expand dates.
        df = expand_date(df)

        df = df \
            .withColumn('Open', df.Open != '0') \
            .withColumn('Promo', df.Promo != '0') \
            .withColumn('StateHoliday', df.StateHoliday != '0') \
            .withColumn('SchoolHoliday', df.SchoolHoliday != '0')

        # Merge in store information.
        store = store_csv.join(store_states_csv, 'Store')
        df = df.join(store, 'Store')

        # Merge in Google Trend information.
        google_trend_all = prepare_google_trend()
        df = df.join(google_trend_all, ['State', 'Year', 'Week']).select(df['*'], google_trend_all.trend)

        # Merge in Google Trend for whole Germany.
        google_trend_de = google_trend_all[google_trend_all.file == 'Rossmann_DE']
        google_trend_de = google_trend_de.withColumnRenamed('trend', 'trend_de')
        df = df.join(google_trend_de, ['Year', 'Week']).select(df['*'], google_trend_de.trend_de)

        # Merge in weather.
        weather = weather_csv.join(state_names_csv, weather_csv.file == state_names_csv.StateName)
        df = df.join(weather, ['State', 'Date'])

        # Fix null values.
        df = df \
            .withColumn('CompetitionOpenSinceYear', F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900))) \
            .withColumn('CompetitionOpenSinceMonth', F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1))) \
            .withColumn('Promo2SinceYear', F.coalesce(df.Promo2SinceYear, F.lit(1900))) \
            .withColumn('Promo2SinceWeek', F.coalesce(df.Promo2SinceWeek, F.lit(1)))

        # Days & months competition was open, cap to 2 years.
        df = df.withColumn('CompetitionOpenSince',
                           F.to_date(F.format_string('%s-%s-15', df.CompetitionOpenSinceYear,
                                                     df.CompetitionOpenSinceMonth)))
        df = df.withColumn('CompetitionDaysOpen',
                           F.when(df.CompetitionOpenSinceYear > 1900,
                                  F.greatest(F.lit(0), F.least(F.lit(360 * 2), F.datediff(df.Date, df.CompetitionOpenSince))))
                           .otherwise(0))
        df = df.withColumn('CompetitionMonthsOpen', (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

        # Days & weeks of promotion, cap to 25 weeks.
        df = df.withColumn('Promo2Since',
                           F.expr('date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'))
        df = df.withColumn('Promo2Days',
                           F.when(df.Promo2SinceYear > 1900,
                                  F.greatest(F.lit(0), F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))))
                           .otherwise(0))
        df = df.withColumn('Promo2Weeks', (df.Promo2Days / 7).cast(T.IntegerType()))

        # Check that we did not lose any rows through inner joins.
        assert num_rows == df.count(), 'lost rows in joins'
        return df
Example #2
    def test_least(self):
        df = self.spark.createDataFrame([(1, 4, 3)], ["a", "b", "c"])

        expected = [Row(least=1)]
        self.assertTrue(
            all([
                df.select(least(df.a, df.b,
                                df.c).alias("least")).collect() == expected,
                df.select(least(lit(3), lit(5),
                                lit(1)).alias("least")).collect() == expected,
                df.select(least("a", "b",
                                "c").alias("least")).collect() == expected,
            ]))
Example #3
 def calc_min_max():
     if len(sdf.columns) > 1:
         min_col = F.least(*map(F.min, sdf))
         max_col = F.greatest(*map(F.max, sdf))
     else:
         min_col = F.min(sdf.columns[-1])
         max_col = F.max(sdf.columns[-1])
     return sdf.select(min_col, max_col).first()
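A minimal usage sketch of the pattern above (assuming an existing SparkSession named spark): F.least(*map(F.min, ...)) builds least(min(col1), min(col2), ...), i.e. the global minimum across all columns in a single pass, and F.greatest does the same for the maximum.

from pyspark.sql import functions as F

sdf = spark.createDataFrame([(1, 9), (4, 2), (7, 5)], ["a", "b"])
# least over the per-column minimums = global minimum; greatest over the per-column maximums = global maximum
row = sdf.select(F.least(*map(F.min, sdf.columns)),
                 F.greatest(*map(F.max, sdf.columns))).first()
# row[0] == 1, row[1] == 9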
Example #4
def compile_least(t, expr, scope, **kwargs):
    op = expr.op()

    src_columns = t.translate(op.arg, scope)
    if len(src_columns) == 1:
        return src_columns[0]
    else:
        return F.least(*src_columns)
Example #5
    def user_item_serendipity(self):
        """Calculate serendipity of each item in the recommendations for each user.
        The metric definition is based on the following references:

        :Citation:

            Y.C. Zhang, D.Ó. Séaghdha, D. Quercia and T. Jambor, Auralist:
            introducing serendipity into music recommendation, WSDM 2012

            Eugene Yan, Serendipity: Accuracy’s unpopular best friend in Recommender Systems,
            eugeneyan.com, April 2020

        Returns:
            pyspark.sql.dataframe.DataFrame: A dataframe with columns: col_user, col_item, user_item_serendipity.
        """
        # for every col_user, col_item in reco_df, join all interacted items from train_df.
        # These interacted items are repeated for each item in reco_df for a specific user.
        if self.df_user_item_serendipity is None:
            self.df_cosine_similarity = self._get_cosine_similarity()
            self.df_user_item_serendipity = (
                self.reco_df.select(
                    self.col_user,
                    self.col_item,
                    F.col(self.col_item).alias(
                        "reco_item_tmp"
                    ),  # duplicate col_item to keep
                )
                .join(
                    self.train_df.select(
                        self.col_user, F.col(self.col_item).alias("train_item_tmp")
                    ),
                    on=[self.col_user],
                )
                .select(
                    self.col_user,
                    self.col_item,
                    F.least(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias(
                        "i1"
                    ),
                    F.greatest(F.col("reco_item_tmp"), F.col("train_item_tmp")).alias(
                        "i2"
                    ),
                )
                .join(self.df_cosine_similarity, on=["i1", "i2"], how="left")
                .fillna(0)
                .groupBy(self.col_user, self.col_item)
                .agg(F.mean(self.sim_col).alias("avg_item2interactedHistory_sim"))
                .join(self.reco_df, on=[self.col_user, self.col_item])
                .withColumn(
                    "user_item_serendipity",
                    (1 - F.col("avg_item2interactedHistory_sim"))
                    * F.col(self.col_relevance),
                )
                .select(self.col_user, self.col_item, "user_item_serendipity")
                .orderBy(self.col_user, self.col_item)
            )
        return self.df_user_item_serendipity
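The least/greatest pair above only puts each (recommended item, interacted item) pair into a canonical order so it matches the symmetric cosine-similarity table keyed by (i1, i2). A minimal sketch of that trick, with made-up column names and an assumed SparkSession named spark:

from pyspark.sql import functions as F

pairs = spark.createDataFrame([(3, 1), (1, 3), (2, 5)], ["reco_item", "train_item"])
canonical = pairs.select(
    F.least("reco_item", "train_item").alias("i1"),      # smaller item id first
    F.greatest("reco_item", "train_item").alias("i2"),   # larger item id second
)
# (3, 1) and (1, 3) both map to (i1=1, i2=3), so they hit the same row of the similarity table.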
Example #6
 def __fence(df, values):
     colname, (lfence, ufence) = list(values.items())[0]
     # Generates two columns, for lower and upper fences
     # and then applies `greatest` and `least` functions
     # to effectively fence the values.
     return (df.withColumn('__fence', F.lit(lfence))
             .withColumn(colname, F.greatest(colname, '__fence'))
             .withColumn('__fence', F.lit(ufence))
             .withColumn(colname, F.least(colname, '__fence'))
             .drop('__fence'))
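The same fencing can be written in one step with nested greatest/least over literal fences. A small equivalent sketch (the column name and fence values are made up, and spark is assumed to be an existing SparkSession):

from pyspark.sql import functions as F

df = spark.createDataFrame([(-5.0,), (42.0,), (250.0,)], ["price"])
fenced = df.withColumn(
    "price",
    F.least(F.greatest(F.col("price"), F.lit(0.0)), F.lit(100.0)),  # clamp to [0, 100]
)
# -5.0 -> 0.0, 42.0 -> 42.0, 250.0 -> 100.0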
Example #7
def covisit(input_df, rate, min_count, unique_b):
    # Duplicate domain column to perform self-join

    if unique_b is False:

        input2_df = count.count_tuples(input_df).withColumnRenamed(
            'domain', 'domain2').withColumnRenamed('count', 'count2')

        # joined_df = self-join to get all domain combinations with same ip
        joined_df = (count.count_tuples(input_df).join(
            input2_df, 'ip',
            'inner').where(input_df.domain != input2_df.domain2).drop(
                'useragent', 'ssp', 'uuid'))

        count_df = joined_df.withColumn('number_covisitations',
                                        least('count', 'count2')).drop(
                                            'count', 'count2')

        count_df = count_df.groupBy(['domain',
                                     'domain2']).sum('number_covisitations')

        count_df = count_df.withColumnRenamed('sum(number_covisitations)',
                                              'number_covisitations')

        count_df = count_df.join(count.count_by_domain(input_df), 'domain', 'inner') \
            .withColumnRenamed('count', 'visits')

    else:
        input2_df = input_df.withColumnRenamed('domain', 'domain2')

        #Drop all duplicates to get only one tuple for each domain-domain2-ip
        joined_df = (input_df.join(
            input2_df, 'ip',
            'inner').where(input_df.domain != input2_df.domain2).drop(
                'useragent', 'ssp',
                'uuid').drop_duplicates(subset=('domain', 'domain2', 'ip')))

        count_df = (joined_df.groupBy(['domain',
                                       'domain2']).count().withColumnRenamed(
                                           'count', 'number_covisitations'))

        count_df = count_df.join(unique.count_unique_tuples(input_df), 'domain', 'inner') \
                    .withColumnRenamed('count', 'visits')

    # calculate co-visitation rate
    count_df = count_df.withColumn('covisit',
                                   col('number_covisitations') / col('visits'))
    count_df = count_df.where(count_df.covisit > rate).where(
        count_df.visits > min_count).drop('visits')

    count_df.show()

    return count_df
Example #8
def Calculate_CCF(graph):
    iteration = 0
    done = False

    while not done:

        iteration += 1
        startPair = newPair.value

        # CCF-Iterate MAP
        ccf_iterate_map = graph.union(graph.select(col("value").alias("key"), col("key").alias("value")))

        # CCF-Iterate REDUCE
        ccf_iterate_reduce_pair = ccf_iterate_map.groupBy(col("key")).agg(collect_set("value").alias("value"))\
                                            .withColumn("min", least(col("key"), array_min("value")))\
                                            .filter((col('key')!=col('min')))

        newPair += ccf_iterate_reduce_pair.withColumn("count", size("value")-1).select(sum("count")).collect()[0][0]

        ccf_iterate_reduce = ccf_iterate_reduce_pair.select(col("min").alias("a_min"), concat(array(col("key")), col("value")).alias("valueList"))\
                                                    .withColumn("valueList", explode("valueList"))\
                                                    .filter((col('a_min')!=col('valueList')))\
                                                    .select(col('a_min').alias("key"), col('valueList').alias("value"))

        # CCF-Dedup MAP & REDUCE
        ccf_dedup_reduce = ccf_iterate_reduce.distinct()

        graph = ccf_dedup_reduce

        if startPair == newPair.value:
            done = True

        print("Itération : ", iteration, "Number of newPair : ", newPair.value)
    
    return graph

# MAIN #  
if __name__ == "__main__":

    sc = pyspark.SparkContext(appName="Spark_RDD")
    spark = SparkSession.builder.getOrCreate()
    newPair = sc.accumulator(0)
    
    dataset_path = "/user/user335/dataset/ccf"
    dataset = sc.textFile(dataset_path + "/web-Google.txt", use_unicode=False)

    graph = prepare_dataset(dataset)

    t1 = time.perf_counter()
    graph = Calculate_CCF(graph)
    t2 = time.perf_counter()

    print("calculation time (s) :", t2 - t1)
Example #9
def quotient(primary_col: str, secondary_col: str, output_col: str,
             df: DataFrame):
    """The quotient is simply the minimum value divided by the maximum value
    Note that if the values are the same this will result in a score of 1.0,
    but if the values are very different this will result in scores close to 0.0"""

    return df.withColumn(
        output_col,
        F.when(
            F.col(primary_col).isNull() | F.col(secondary_col).isNull(),
            None).otherwise(
                F.least(F.col(primary_col), F.col(secondary_col)) /
                F.greatest(F.col(primary_col), F.col(secondary_col))),
    )
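A hypothetical call to quotient() on a tiny DataFrame (column names are illustrative; spark is assumed to be an existing SparkSession): identical values give 1.0, very different values give a small ratio, and a null on either side yields null because of the when() guard.

df = spark.createDataFrame(
    [(10.0, 10.0), (2.0, 8.0), (None, 5.0)],
    "price_a double, price_b double",
)
quotient("price_a", "price_b", "similarity", df).show()
# 10/10 -> 1.0, 2/8 -> 0.25, null/5 -> null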
Example #10
def test_least(data_gen):
    num_cols = 20
    s1 = gen_scalar(data_gen, force_no_nulls=True)
    # we want lots of nulls
    gen = StructGen(
        [('_c' + str(x), data_gen.copy_special_case(None, weight=100.0))
         for x in range(0, num_cols)],
        nullable=False)

    command_args = [f.col('_c' + str(x)) for x in range(0, num_cols)]
    command_args.append(s1)
    data_type = data_gen.data_type
    assert_gpu_and_cpu_are_equal_collect(
        lambda spark: gen_df(spark, gen).select(f.least(*command_args)))
Example #11
    def get_bins(sdf, bins):
        # 'data' is a Spark DataFrame that selects all columns.
        if len(sdf.columns) > 1:
            min_col = F.least(*map(F.min, sdf))
            max_col = F.greatest(*map(F.max, sdf))
        else:
            min_col = F.min(sdf.columns[-1])
            max_col = F.max(sdf.columns[-1])
        boundaries = sdf.select(min_col, max_col).first()

        # divides the boundaries into bins
        if boundaries[0] == boundaries[1]:
            boundaries = (boundaries[0] - 0.5, boundaries[1] + 0.5)

        return np.linspace(boundaries[0], boundaries[1], bins + 1)
Example #12
def persist_cabs(cabs):
    """Filter, calculate columns, partition on start time and cache cab df."""
    cabs = cabs \
        .filter(cabs.trip_tot < 500) \
        .select(['taxi', 'start_str', 'comm_pick', 'dur',
                 'dist', 'fare', 'tip', 'extra']) \
        .fillna(0, subset=['fare', 'tip', 'extra', 'dur', 'dist'])
    cabs = cabs \
        .withColumn('startrnd', sf.date_trunc("Hour",
            sf.to_timestamp(cabs.start_str, 'MM/dd/yyyy hh:mm:ss aa'))) \
        .withColumn('total', cabs.fare + cabs.tip + cabs.extra) \
        .drop('start_str', 'fare', 'tip', 'extra')
    cabs = cabs \
        .withColumn('permile',
            sf.when(cabs.dist > 0.2, sf.least(cabs.total / cabs.dist, sf.lit(20))) \
              .otherwise(sf.lit(4))) \
        .withColumn('permin',
            sf.when(cabs.dur > 1, sf.least(cabs.total / (cabs.dur/60), sf.lit(5))) \
              .otherwise(sf.lit(1))) \
        .drop('dur', 'dist')

    cabs = cabs.repartition(200, 'startrnd') \
        .persist(StorageLevel.MEMORY_AND_DISK_SER)
    return cabs
Example #13
def initial_centroids(next_selected_cent, data_input, i):
    if i == k-1:
        data_cent6 = data_input.join(broadcast(next_selected_cent))

        data_cent7 = data_cent6.withColumn(str(i), squaree_spark1(data_cent6.columns[0], data_cent6.columns[1],
                                                                  data_cent6.columns[k+2], data_cent6.columns[k+3]))  # +3 +4

        data_cent8 = data_cent7.drop('mindist').drop(data_cent7.columns[k+2]).drop(data_cent7.columns[k+3])

        return data_cent8
    else:
        data_cent6 = data_input.join(broadcast(next_selected_cent))

        data_cent7 = data_cent6.withColumn(str(i), squaree_spark1(data_cent6.columns[0], data_cent6.columns[1],
                                                                  data_cent6.columns[i+3], data_cent6.columns[i+4]))

        data_cent8 = data_cent7.drop(data_cent7.columns[i+3]).drop(data_cent7.columns[i+4])

        data_cent9 = data_cent8.withColumn('mindist1', least(data_cent8.columns[i+3], col('mindist')))

        data_cent10 = data_cent9.drop('mindist')

        data_cent12 = data_cent10.withColumnRenamed('mindist1', 'mindist')

        data_cent13 = data_cent12.repartition(2001)

        next_cent_cache = data_cent13.orderBy(desc('mindist')).limit(1).cache()

        next_cent = next_cent_cache.select(data_cent12.columns[0:2])

        return next_cent, data_cent12
Example #14
def scoreModelAgainstSamples(model, data_frame, cutoff_dist=20):
    # Predict y from the model for each sample x and sum the absolute distances between the real y
    # and the predicted y, truncating each error at cutoff_dist.

    # from pyspark.sql import SparkSession
    # session = SparkSession(spark.sparkContext)
    #

    # sqlContext.getOrCreate(spark.sparkContext).sql('select sum(y) from x_y_table')

    total_score = 0

    df = data_frame.withColumn('pred_y',
                               model['a'] * data_frame['x'] + model['b'])
    df = df.withColumn(
        'score', F.least(F.abs((df['y'] - df['pred_y'])), F.lit(cutoff_dist)))
    df = df.agg({"score": "sum"})
    # df.explain()
    total_score = df.collect()[0][0]

    # def check_model(model, ):
    #     pred_y = model['a'] * sample['x'] + model['b']
    #     score = min(abs(sample['y'] - pred_y), 20)
    #     return
    #
    # rdd = data_frame.rdd
    # rdd_modeled = rdd.map(check_model(model))
    # rdd_total_score = rdd_modeled.reduce(lambda a, b: a + b)

    # totalScore = 0
    # for sample_i in range(0, len(samples) - 1):
    #     sample = samples[sample_i]
    #     pred_y = model['a'] * sample['x'] + model['b']
    #     score = min(abs(sample['y'] - pred_y), cutoff_dist)
    #
    #     totalScore += score

    # print("model ",model, " score ", totalScore)
    return total_score
Example #15
def initial_centroids(next_selected, data_cent_5_persist, i):

   data_cent6 = data_cent_5_persist.join(broadcast(next_selected))


   data_cent6 = data_cent6.withColumn(str(i),squaree_spark1(data_cent6.columns[0],data_cent6.columns[1],
                                             data_cent6.columns[i+3],data_cent6.columns[i+4]))#+4 +5
   
   
   data_cent6 = data_cent6.drop(data_cent6.columns[i+3]).drop(data_cent6.columns[i+4])#+4 +5


   data_cent6 = data_cent6.withColumn('mindist1',least(data_cent6.columns[i+3], col('mindist')))#4

   data_cent6 = data_cent6.drop('mindist')
   
   data_cent6 = data_cent6.withColumnRenamed('mindist1','mindist')
   
   next_cent = data_cent6.orderBy(desc('mindist')).limit(1).select(data_cent6.columns[0:2])#1:3


   return next_cent,data_cent6
Example #16
    def _get_datasets_by_gene(
        self, stats_results_df, observations_df, ontology_metadata_df, compress=True
    ):
        significance_cols = [
            "female_ko_effect_p_value",
            "male_ko_effect_p_value",
            "genotype_effect_p_value",
            "male_pvalue_low_vs_normal_high",
            "male_pvalue_low_normal_vs_high",
            "female_pvalue_low_vs_normal_high",
            "female_pvalue_low_normal_vs_high",
            "genotype_pvalue_low_normal_vs_high",
            "genotype_pvalue_low_vs_normal_high",
            "male_ko_effect_p_value",
            "female_ko_effect_p_value",
            "p_value",
            "effect_size",
            "male_effect_size",
            "female_effect_size",
            "male_effect_size_low_vs_normal_high",
            "male_effect_size_low_normal_vs_high",
            "genotype_effect_size_low_vs_normal_high",
            "genotype_effect_size_low_normal_vs_high",
            "female_effect_size_low_vs_normal_high",
            "female_effect_size_low_normal_vs_high",
            "significant",
            "full_mp_term",
            "metadata_group",
            "male_mutant_count",
            "female_mutant_count",
            "statistical_method",
            "mp_term_id",
            "top_level_mp_term_id",
            "top_level_mp_term_name",
            "sex",
        ]

        data_set_cols = [
            "allele_accession_id",
            "allele_symbol",
            "gene_symbol",
            "gene_accession_id",
            "parameter_stable_id",
            "parameter_name",
            "procedure_stable_id",
            "procedure_name",
            "pipeline_name",
            "pipeline_stable_id",
            "zygosity",
            "phenotyping_center",
            "life_stage_name",
        ]

        stats_results_df = stats_results_df.select(*(data_set_cols + significance_cols))
        stats_results_df = stats_results_df.withColumn(
            "selected_p_value",
            functions.when(
                functions.col("statistical_method").isin(
                    ["Manual", "Supplied as data"]
                ),
                functions.col("p_value"),
            )
            .when(
                functions.col("statistical_method").contains("Reference Range Plus"),
                functions.when(
                    functions.col("sex") == "male",
                    functions.least(
                        functions.col("male_pvalue_low_vs_normal_high"),
                        functions.col("male_pvalue_low_normal_vs_high"),
                    ),
                )
                .when(
                    functions.col("sex") == "female",
                    functions.least(
                        functions.col("female_pvalue_low_vs_normal_high"),
                        functions.col("female_pvalue_low_normal_vs_high"),
                    ),
                )
                .otherwise(
                    functions.least(
                        functions.col("genotype_pvalue_low_normal_vs_high"),
                        functions.col("genotype_pvalue_low_vs_normal_high"),
                    )
                ),
            )
            .otherwise(
                functions.when(
                    functions.col("sex") == "male",
                    functions.col("male_ko_effect_p_value"),
                )
                .when(
                    functions.col("sex") == "female",
                    functions.col("female_ko_effect_p_value"),
                )
                .otherwise(
                    functions.when(
                        functions.col("statistical_method").contains(
                            "Fisher Exact Test framework"
                        ),
                        functions.col("p_value"),
                    ).otherwise(functions.col("genotype_effect_p_value"))
                )
            ),
        )
        stats_results_df = stats_results_df.withColumn(
            "selected_p_value", functions.col("selected_p_value").cast(DoubleType())
        )
        stats_results_df = stats_results_df.withColumn(
            "selected_effect_size",
            functions.when(
                functions.col("statistical_method").isin(
                    ["Manual", "Supplied as data"]
                ),
                functions.lit(1.0),
            )
            .when(
                ~functions.col("statistical_method").contains("Reference Range Plus"),
                functions.when(
                    functions.col("sex") == "male", functions.col("male_effect_size")
                )
                .when(
                    functions.col("sex") == "female",
                    functions.col("female_effect_size"),
                )
                .otherwise(functions.col("effect_size")),
            )
            .otherwise(
                functions.when(
                    functions.col("sex") == "male",
                    functions.when(
                        functions.col("male_effect_size_low_vs_normal_high")
                        <= functions.col("male_effect_size_low_normal_vs_high"),
                        functions.col("genotype_effect_size_low_vs_normal_high"),
                    ).otherwise(
                        functions.col("genotype_effect_size_low_normal_vs_high")
                    ),
                )
                .when(
                    functions.col("sex") == "female",
                    functions.when(
                        functions.col("female_effect_size_low_vs_normal_high")
                        <= functions.col("female_effect_size_low_normal_vs_high"),
                        functions.col("genotype_effect_size_low_vs_normal_high"),
                    ).otherwise(
                        functions.col("genotype_effect_size_low_normal_vs_high")
                    ),
                )
                .otherwise(functions.col("effect_size"))
            ),
        )
        stats_results_df = stats_results_df.withColumn(
            "selected_phenotype_term", functions.col("mp_term_id")
        )
        observations_df = observations_df.select(*data_set_cols).distinct()
        datasets_df = observations_df.join(
            stats_results_df, data_set_cols, "left_outer"
        )
        datasets_df = datasets_df.groupBy(data_set_cols).agg(
            functions.collect_set(
                functions.struct(
                    *[
                        "selected_p_value",
                        "selected_effect_size",
                        "selected_phenotype_term",
                        "metadata_group",
                        "male_mutant_count",
                        "female_mutant_count",
                        "significant",
                        "top_level_mp_term_id",
                        "top_level_mp_term_name",
                    ]
                )
            ).alias("stats_data")
        )
        datasets_df = datasets_df.withColumn(
            "successful_stats_data",
            functions.expr(
                "filter(stats_data, stat -> stat.selected_p_value IS NOT NULL)"
            ),
        )
        datasets_df = datasets_df.withColumn(
            "stats_data",
            functions.when(
                functions.size("successful_stats_data") > 0,
                functions.sort_array("successful_stats_data").getItem(0),
            ).otherwise(functions.sort_array("stats_data").getItem(0)),
        )
        datasets_df = datasets_df.select(*data_set_cols, "stats_data.*")
        datasets_df = datasets_df.withColumnRenamed("selected_p_value", "p_value")
        datasets_df = datasets_df.withColumnRenamed(
            "selected_effect_size", "effect_size"
        )
        datasets_df = datasets_df.withColumnRenamed(
            "selected_phenotype_term", "phenotype_term_id"
        )
        datasets_df = datasets_df.withColumnRenamed(
            "top_level_mp_term_id", "top_level_phenotype_term_id"
        )

        datasets_df = datasets_df.withColumn(
            "top_level_mp_term_name",
            array_except(
                col("top_level_mp_term_name"), array(lit(None).cast("string"))
            ),
        )

        datasets_df = datasets_df.withColumnRenamed(
            "top_level_mp_term_name", "top_level_phenotype_term_name"
        )
        datasets_df = datasets_df.join(
            ontology_metadata_df, "phenotype_term_id", "left_outer"
        )
        datasets_df = datasets_df.withColumn(
            "significance",
            functions.when(
                functions.col("significant") == True, functions.lit("Significant")
            )
            .when(
                functions.col("p_value").isNotNull(), functions.lit("Not significant")
            )
            .otherwise(functions.lit("N/A")),
        )
        mgi_datasets_df = datasets_df.groupBy("gene_accession_id").agg(
            functions.collect_set(
                functions.struct(
                    *(
                        data_set_cols
                        + [
                            "significance",
                            "p_value",
                            "effect_size",
                            "metadata_group",
                            "male_mutant_count",
                            "female_mutant_count",
                            "phenotype_term_id",
                            "phenotype_term_name",
                            "top_level_phenotype_term_id",
                            "top_level_phenotype_term_name",
                        ]
                    )
                )
            ).alias("datasets_raw_data")
        )

        mgi_datasets_df = mgi_datasets_df.withColumnRenamed(
            "gene_accession_id", "mgi_accession_id"
        )

        if compress:
            to_json_udf = functions.udf(
                lambda row: None
                if row is None
                else json.dumps(
                    [
                        {key: value for key, value in item.asDict().items()}
                        for item in row
                    ]
                ),
                StringType(),
            )
            mgi_datasets_df = mgi_datasets_df.withColumn(
                "datasets_raw_data", to_json_udf("datasets_raw_data")
            )
            compress_and_encode = functions.udf(self._compress_and_encode, StringType())
            mgi_datasets_df = mgi_datasets_df.withColumn(
                "datasets_raw_data", compress_and_encode("datasets_raw_data")
            )
        return mgi_datasets_df
Example #17
def amend_device_tracking(observations_df, tracking_df, last_updated_by):  # type: (DataFrame, DataFrame, str) -> typing.Tuple[DataFrame, DataFrame, DataFrame]
    """
    Blends new observations into an existing device tracking dataset.

    :param observations_df: New observations to be used for amending the device tracking data set.
    :param tracking_df: The device tracking data set.
    :param last_updated_by: The last updated user/process tracking field
    :return: A 3-tuple of (modified device tracking records only, updated full device tracking data set, device tracking records for never-before-seen devices only)
    """
    observations_df = observations_df.alias('o')
    tracking_df = tracking_df.alias('t')

    pk = ['organization', 'mac']

    # Find the tracking records that are changed by the new observations.

    delta_df = observations_df.select(
        'organization',
        'mac',
        'first_observed_at',
        'last_observed_at'
    ).join(
        tracking_df,
        on=pk,
        how='left'
    ).where(
        col('t.mac').isNull()
        | (col('o.first_observed_at') < col('t.first_observed_at'))
        | (col('o.last_observed_at') > col('t.last_observed_at'))
    ).select(
        'o.organization',
        'o.mac',
        least('o.first_observed_at', 't.first_observed_at').name('first_observed_at'),
        greatest('o.last_observed_at', 't.last_observed_at').name('last_observed_at'),
    ).cache()

    # Create a new version of the entire device tracking dataset, and checkpoint it to break the cyclic lineage
    # caused by reading from and writing to the same table.

    refresh_df = tracking_df.join(
        delta_df,
        on=pk,
        how='left_anti'  # Retain only the unmodified records.
    ).unionByName(
        delta_df.select(
            '*',
            current_timestamp().name('last_updated_at'),
            lit(last_updated_by).name('last_updated_by')
        )
    ).coalesce(
        1
    ).checkpoint(
        eager=True
    )

    # Find any never-before-seen devices.

    new_devices_df = delta_df.join(
        tracking_df,
        on=pk,
        how='left_anti'
    ).cache()

    return delta_df, refresh_df, new_devices_df
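The delta computation above leans on the fact that least and greatest skip nulls, so a never-before-seen device (whose tracking columns come back null from the left join) simply keeps its observed window. A tiny sketch of that behaviour, with made-up column names and an assumed SparkSession named spark:

from pyspark.sql.functions import least

demo = spark.createDataFrame(
    [("2021-01-05 10:00:00", None)],
    "o_first_observed_at string, t_first_observed_at string",
)
demo.select(
    least("o_first_observed_at", "t_first_observed_at").alias("first_observed_at")
).show()
# -> 2021-01-05 10:00:00  (the null tracking value is skipped, not propagated)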
Example #18
def run(w=14, l=114, threshold=0):
    # Creating Data Frame and filtering by threshold
    pattern, k = random_pattern(l, w), w

    Chiaromonte = [[91, -114, -31, -123], [-114, 100, -125, -31],
                   [-31, -125, 100, -114], [-123, -31, -114, 91]]

    spark = SparkSession.builder.appName('Distributed FSWM').getOrCreate()

    df = spark.read.text("data/example.fasta")

    # Read the sequences
    sequences = df.where(~df.value.contains('>')).rdd.map(list).map(
        lambda x: (x[0].encode('ascii'))).map(list)

    # Defining schema for data frame
    schema = StructType([
        StructField("id", IntegerType()),
        StructField("Sequence", ArrayType(StringType()))
    ])

    df = spark.createDataFrame(
        (tuple([_id, data[0]])
         for _id, data in enumerate(map(lambda x: [x], sequences.take(2)))),
        schema=schema)

    # Creating ngrams
    ngram = NGram(n=w, inputCol="Sequence", outputCol="ngrams")
    df_clean = ngram.transform(df).select(["id", "ngrams"])

    # Exploding ngrams into the data frame
    df_explode = df_clean.withColumn('ngrams', explode('ngrams'))

    # Defining the reducer
    # Create the UDF object that wraps the python function reducer_concat
    udf_object = udf(lambda y: reducer_concat(y), IntegerType())

    # This step should be repeated for all the sequences; only the first two are handled here.

    df_w0 = df_explode.where(df_clean.id == 0)
    df_w0 = df_w0.withColumn("id0",
                             monotonically_increasing_id() +
                             1).withColumnRenamed('ngrams',
                                                  'w0').select('id0', 'w0')
    df0 = df_w0.withColumn("word0",
                           udf_object(df_w0.w0)).select("id0", "word0")
    df0.show()

    df_w1 = df_explode.where(df_clean.id == 1)
    df_w1 = df_w1.withColumn("id1",
                             monotonically_increasing_id() +
                             1).withColumnRenamed('ngrams',
                                                  'w1').select('id1', 'w1')
    df1 = df_w1.withColumn("word1",
                           udf_object(df_w1.w1)).select("id1", "word1")
    df1.show(truncate=False)

    df_result = df0.crossJoin(df1) \
        .withColumn("spaced_word", udf_spaced_words(pattern)(col("word0"), col("word1"))) \
        .where(col("spaced_word").isNotNull()) \
        .withColumn("score", udf_score(pattern, k, Chiaromonte)(col("word0"), col("word1"))) \
        .where(col("score") > threshold) \
        .orderBy(["spaced_word", "score"], ascending=False) \
        .withColumn("min", least(col("id0"), col("id1"))) \
        .withColumn("max", greatest(col("id0"), col("id1"))) \
        .drop_duplicates(subset=["spaced_word", "min"]) \
        .drop_duplicates(subset=["spaced_word", "max"]) \
        .withColumn("JukesCantor", udf_jukes_cantor(pattern, k)(col("word0"), col("word1")))

    df_result.show()

    p = df_result.agg(suma("JukesCantor")).collect()[0][0] * 1.0 / (
        (k - bin(pattern).count("1") / 2) * df_result.count())

    print(JukesCantor(p))
Example #19
def main(argv):
    """
    Solr Core loader
    :param list argv: the list elements should be:
                    [1]: stats results parquet path
                    [2]: ontology parquet path
                    [3]: output path
    """
    stats_results_parquet_path = argv[1]
    ontology_parquet_path = argv[2]
    output_path = argv[3]

    spark = SparkSession.builder.getOrCreate()
    stats_results_df = spark.read.parquet(stats_results_parquet_path)
    ontology_df = spark.read.parquet(ontology_parquet_path)

    genotype_phenotype_df = stats_results_df.where(col("significant") == True).select(
        GENOTYPE_PHENOTYPE_COLUMNS + STATS_RESULTS_COLUMNS
    )
    genotype_phenotype_df = genotype_phenotype_df.withColumn(
        "mp_term", explode_outer("full_mp_term")
    )
    genotype_phenotype_df = genotype_phenotype_df.withColumn("sex", col("mp_term.sex"))
    genotype_phenotype_df = genotype_phenotype_df.withColumn(
        "mp_term_id", col("mp_term.term_id")
    )

    genotype_phenotype_df = genotype_phenotype_df.join(
        ontology_df, col("mp_term_id") == col("id"), "left_outer"
    )

    for column_name, ontology_column in ONTOLOGY_STATS_MAP.items():
        genotype_phenotype_df = genotype_phenotype_df.withColumn(
            f"{column_name}", col(ontology_column)
        )

    genotype_phenotype_df = genotype_phenotype_df.withColumn(
        "p_value",
        when(
            col("statistical_method").isin(["Manual", "Supplied as data"]),
            col("genotype_effect_p_value"),
        )
        .when(
            col("statistical_method").contains("Reference Range Plus"),
            when(
                col("sex") == "male",
                least(
                    col("male_pvalue_low_vs_normal_high"),
                    col("male_pvalue_low_normal_vs_high"),
                ),
            )
            .when(
                col("sex") == "female",
                least(
                    col("female_pvalue_low_vs_normal_high"),
                    col("female_pvalue_low_normal_vs_high"),
                ),
            )
            .otherwise(col("genotype_effect_p_value")),
        )
        .otherwise(
            when(col("sex") == "male", col("male_ko_effect_p_value"))
            .when(col("sex") == "female", col("female_ko_effect_p_value"))
            .otherwise(
                when(
                    col("statistical_method").contains("Fisher Exact Test framework"),
                    col("p_value"),
                ).otherwise(col("genotype_effect_p_value"))
            )
        ),
    )

    genotype_phenotype_df = genotype_phenotype_df.withColumn(
        "effect_size",
        when(col("statistical_method").isin(["Manual", "Supplied as data"]), lit(1.0))
        .when(
            ~col("statistical_method").contains("Reference Range Plus"),
            when(col("sex") == "male", col("male_effect_size"))
            .when(col("sex") == "female", col("female_effect_size"))
            .otherwise(col("effect_size")),
        )
        .otherwise(
            when(
                col("sex") == "male",
                when(
                    col("male_effect_size_low_vs_normal_high")
                    <= col("male_effect_size_low_normal_vs_high"),
                    col("genotype_effect_size_low_vs_normal_high"),
                ).otherwise(col("genotype_effect_size_low_normal_vs_high")),
            )
            .when(
                col("sex") == "female",
                when(
                    col("female_effect_size_low_vs_normal_high")
                    <= col("female_effect_size_low_normal_vs_high"),
                    col("genotype_effect_size_low_vs_normal_high"),
                ).otherwise(col("genotype_effect_size_low_normal_vs_high")),
            )
            .otherwise(col("effect_size"))
        ),
    )

    genotype_phenotype_df = genotype_phenotype_df.withColumn(
        "percentage_change",
        when(col("sex") == "male", col("male_percentage_change"))
        .when(col("sex") == "female", col("female_percentage_change"))
        .otherwise(col("percentage_change")),
    )

    genotype_phenotype_df = genotype_phenotype_df.withColumn(
        "assertion_type_id",
        when(
            col("statistical_method").isin(["Manual", "Supplied as data"]),
            lit("ECO:0000218"),
        ).otherwise(lit("ECO:0000203")),
    )

    genotype_phenotype_df = genotype_phenotype_df.withColumn(
        "assertion_type",
        when(
            col("statistical_method").isin(["Manual", "Supplied as data"]),
            lit("manual"),
        ).otherwise(lit("automatic")),
    )

    genotype_phenotype_df = genotype_phenotype_df.select(
        GENOTYPE_PHENOTYPE_COLUMNS
        + list(ONTOLOGY_STATS_MAP.keys())
        + ["assertion_type_id", "assertion_type"]
    )
    genotype_phenotype_df = genotype_phenotype_df.withColumn(
        "doc_id", monotonically_increasing_id().astype(StringType())
    )
    genotype_phenotype_df.distinct().write.parquet(output_path)

Example #20
# just for the first round (first time, i = 0)
i = 0

data_cent = data_spark_df.join(broadcast(df_centroid))

data_cent1 = data_cent.withColumn(str(i),squaree_spark1(data_cent.columns[0],data_cent.columns[1],
                                              data_cent.columns[2*i+2],data_cent.columns[2*i+3]))

data_cent2 = data_cent1.drop(data_cent1.columns[i+2]).drop(data_cent1.columns[i+3])

data_cent3 = data_cent2.withColumn('mindist',col(str(i)))

data_cent4 = data_cent3.withColumn('mindist1',least(data_cent3.columns[i+2], col('mindist')))

data_cent4 = data_cent4.drop('mindist')

data_cent5 = data_cent4.withColumnRenamed('mindist1','mindist')

next_selected = data_cent5.orderBy(desc('mindist')).limit(1).select(data_cent5.columns[0:2])#1:3


df_centroid = df_centroid.union(next_selected)

u = [str(i)+'x',str(i)+'y']
next_selected = next_selected.toDF(*u)


Example #21
   df3 = df2.withColumn('distZone3', ((df._3 - centroid3[0])**2 +
                                      (df._4 - centroid3[1])**2 +
                                      (df._5 - centroid3[2])**2)**0.5)
   df4 = df3.withColumn('distZone4', ((df._3 - centroid4[0])**2 +
                                      (df._4 - centroid4[1])**2 +
                                      (df._5 - centroid4[2])**2)**0.5)
   df5 = df4.select(df4._1, df4._2, df4._3, df4._4, df4._5,
                    df4.distZone1, df4.distZone2, df4.distZone3, df4.distZone4,
                    least("distZone1", "distZone2", "distZone3", "distZone4").alias('minDis'))
   #assigning clusters
   df5 = df5.withColumn('prediction',when(df5.minDis == df5.distZone1,"Zone1") \
     .when(df5.minDis == df5.distZone2,"Zone2") \
     .when(df5.minDis == df5.distZone3,"Zone3") \
     .otherwise("Zone4"))
   #creating SQLContext and registering previous df as a table to build new centroids
   sqlcontext = SQLContext(spark)
   sqlcontext.registerDataFrameAsTable(df5, "df")
   #building new centroid1
   Zone1df = sqlcontext.sql(
       "SELECT  * FROM df WHERE df.prediction = 'Zone1'")
   Zone1dfmeans = Zone1df.select(mean(col("_3")).alias("shotclockmean") \
    , mean(col("_4")).alias("shotdistmean") \
    , mean(col("_5")).alias("closedefmean")).collect()
   shotclock = Zone1dfmeans[0]["shotclockmean"]
Example #22
#### Column   ##################################################################
from pyspark.sql.functions import  (
concat, concat_ws, collect_list, collect_set, explode, 
explode_outer, flatten, greatest, least, posexplode, posexplode_outer, struct
)


concat(*col)      #  Concatenates multiple input columns together into a single column. The function works with strings, binary and compatible array columns.
concat_ws(sep=";", *col)   #  Concatenates multiple input string columns together into a single string column, using the given separator.
collect_list   ##   df2.agg(collect_list('age')).collect()   Aggregate function: returns a list of objects with duplicates.
collect_set    ###  Aggregate function: returns a set of objects with duplicate elements eliminated.
explode   ## array --> rows  eDF.select(explode(eDF.intlist).alias("anInt")).collect()
explode_outer   ### array --> rows  Unlike explode, if the array/map is null or empty then null is produced.
flatten   ## Collection function: creates a single array from an array of arrays (removes one level of nesting).
greatest   # Returns the greatest value of the list of column names, skipping null values.  df.select(greatest(df.a, df.b, df.c).alias("greatest")).collect()
least(col1, col2, col3)  # Returns the least value of the list of column names, skipping null values.
posexplode(col)  # Returns a new row for each element with position in the given array or map.  eDF.select(posexplode(eDF.intlist)).collect()
posexplode_outer  ### Like posexplode, but a null/empty array/map produces a row of nulls instead of being dropped.
struct  ## new struct column,  df.select(struct('age', 'name').alias("struct")).collect()
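A small illustration of the least/greatest null handling (made-up data; spark is an existing SparkSession):

df = spark.createDataFrame([(1, 4, None)], "a int, b int, c int")
df.select(least(df.a, df.b, df.c).alias("least"),
          greatest(df.a, df.b, df.c).alias("greatest")).collect()
# [Row(least=1, greatest=4)]   nulls are skipped; the result is null only if all inputs are null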


#### Rows Agg operation  #######################################################
from pyspark.sql.functions import  (
grouping, grouping_id, first, last  )


grouping      #  df.cube("name").agg(grouping("name"), sum("age")).orderBy("name").show()
grouping_id   # df.cube("name").agg(grouping_id(), sum("age")).orderBy("name").show()  returns the level of grouping,
first   ###  Aggregate function: returns the first value in a group
last    ###  Aggregate function: returns the last value in a group
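A small illustration of first/last inside a grouped aggregation (made-up data; without an explicit ordering the picked rows are not deterministic):

df = spark.createDataFrame([("Alice", 2), ("Alice", 5), ("Bob", 7)], ("name", "age"))
df.groupBy("name").agg(first("age").alias("first_age"),
                       last("age").alias("last_age")).show()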
Example #23
def prepare_df(
    df: pyspark.sql.DataFrame,
    store_csv: pyspark.sql.DataFrame,
    store_states_csv: pyspark.sql.DataFrame,
    state_names_csv: pyspark.sql.DataFrame,
    google_trend_csv: pyspark.sql.DataFrame,
    weather_csv: pyspark.sql.DataFrame,
) -> pyspark.sql.DataFrame:
    num_rows = df.count()

    # expand dates
    df = expand_date(df)

    # create new boolean columns in the DataFrame flagging special events (promo/holiday where sales were zero or the store was closed).
    df = (df.withColumn("Open", df.Open != "0").withColumn(
        "Promo",
        df.Promo != "0").withColumn("StateHoliday",
                                    df.StateHoliday != "0").withColumn(
                                        "SchoolHoliday",
                                        df.SchoolHoliday != "0"))

    # merge store information
    store = store_csv.join(store_states_csv, "Store")
    df = df.join(store, "Store")

    # merge Google Trend information
    google_trend_all = prepare_google_trend(google_trend_csv)
    df = df.join(google_trend_all,
                 ["State", "Year", "Week"]).select(df["*"],
                                                   google_trend_all.trend)

    # merge in Google Trend for whole Germany
    google_trend_de = google_trend_all[google_trend_all.file ==
                                       "Rossmann_DE"].withColumnRenamed(
                                           "trend", "trend_de")
    df = df.join(google_trend_de,
                 ["Year", "Week"]).select(df["*"], google_trend_de.trend_de)

    # merge weather
    weather = weather_csv.join(state_names_csv,
                               weather_csv.file == state_names_csv.StateName)
    df = df.join(weather, ["State", "Date"])

    # fix null values
    df = (df.withColumn(
        "CompetitionOpenSinceYear",
        F.coalesce(df.CompetitionOpenSinceYear, F.lit(1900)),
    ).withColumn(
        "CompetitionOpenSinceMonth",
        F.coalesce(df.CompetitionOpenSinceMonth, F.lit(1)),
    ).withColumn("Promo2SinceYear",
                 F.coalesce(df.Promo2SinceYear, F.lit(1900))).withColumn(
                     "Promo2SinceWeek", F.coalesce(df.Promo2SinceWeek,
                                                   F.lit(1))))

    # days and months since the competition has been open, cap it to 2 years
    df = df.withColumn(
        "CompetitionOpenSince",
        F.to_date(
            F.format_string("%s-%s-15", df.CompetitionOpenSinceYear,
                            df.CompetitionOpenSinceMonth)),
    )
    df = df.withColumn(
        "CompetitionDaysOpen",
        F.when(
            df.CompetitionOpenSinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(360 * 2),
                        F.datediff(df.Date, df.CompetitionOpenSince)),
            ),
        ).otherwise(0),
    )
    df = df.withColumn("CompetitionMonthsOpen",
                       (df.CompetitionDaysOpen / 30).cast(T.IntegerType()))

    # days and weeks of promotion, cap it to 25 weeks
    df = df.withColumn(
        "Promo2Since",
        F.expr(
            'date_add(format_string("%s-01-01", Promo2SinceYear), (cast(Promo2SinceWeek as int) - 1) * 7)'
        ),
    )
    df = df.withColumn(
        "Promo2Days",
        F.when(
            df.Promo2SinceYear > 1900,
            F.greatest(
                F.lit(0),
                F.least(F.lit(25 * 7), F.datediff(df.Date, df.Promo2Since))),
        ).otherwise(0),
    )
    df = df.withColumn("Promo2Weeks",
                       (df.Promo2Days / 7).cast(T.IntegerType()))

    # ensure that no row was lost through inner joins
    assert num_rows == df.count(), "lost rows in joins"
    return df
Example #24
        data_cent11 = data_spark_df.join(broadcast(next_selected))
        #         print(data_cent.count())

        data_cent11 = data_cent11.withColumn(
            str(i),
            squaree_spark1(data_cent11.columns[0], data_cent11.columns[1],
                           data_cent11.columns[i + 2],
                           data_cent11.columns[i + 3]))
        #         data_cent11.show()
        data_cent11 = data_cent11.drop(data_cent11.columns[i + 2]).drop(
            data_cent11.columns[i + 3])
        #         data_cent11.show()
        data_cent11 = data_cent11.withColumn('mindist', col(str(i)))
        #         data_cent11.show()
        data_cent11 = data_cent11.withColumn(
            'mindist1', least(data_cent11.columns[i + 2], col('mindist')))
        #         data_cent11.show()
        data_cent11 = data_cent11.drop('mindist')
        data_cent11 = data_cent11.withColumnRenamed('mindist1', 'mindist')
    elif i > 0:
        data_cent11 = data_cent11.join(broadcast(next_selected))
        #         data_cent11.show()
        data_cent11 = data_cent11.withColumn(
            str(i),
            squaree_spark1(data_cent11.columns[0], data_cent11.columns[1],
                           data_cent11.columns[i + 3],
                           data_cent11.columns[i + 4]))
        #         data_cent11.show()
        data_cent11 = data_cent11.drop(u[0]).drop(u[1])
        #         data_cent11.show()
        data_cent11 = data_cent11.withColumn('mindist1',
Example #25
dataCol = data_spark_df.columns
for i in range(k):
    if i == 0:
        centerOfiTH = clusters.centers[i].tolist(
        )  #for example, the entry against which you want distances
        distance_udf = F.udf(
            lambda x: float(
                distance.euclidean([float(z) for z in x], centerOfiTH)),
            FloatType())
        columns = [F.col(c) for c in dataCol]
        data_cent = data_spark_df.withColumn('dis' + str(i) + 'th',
                                             distance_udf(F.array(columns)))
        data_cent = data_cent.withColumn('mindist', col('dis' + str(i) + 'th'))
        data_cent = data_cent.withColumn(
            'mindist1', least(col('dis' + str(i) + 'th'), col('mindist')))
        data_cent = data_cent.drop('mindist')
        #     .drop('dis' + str(i) + 'th')
        data_cent = data_cent.withColumnRenamed('mindist1', 'mindist')
    elif i > 0:
        centerOfiTH = clusters.centers[i].tolist(
        )  #for example, the entry against which you want distances
        distance_udf = F.udf(
            lambda x: float(
                distance.euclidean([float(z) for z in x], centerOfiTH)),
            FloatType())
        columns = [F.col(c) for c in dataCol]
        data_cent = data_cent.withColumn('dis' + str(i) + 'th',
                                         distance_udf(F.array(columns)))
        data_cent = data_cent.withColumn('mindist1',
                                         least(col('dis' + str(i) + 'th'),
Example #26
def fillspark(hist, df):
    import pyspark.sql.functions as fcns

    indexes = []
    for axis in hist._group + hist._fixed:
        exprcol = tocolumns(df, histbook.instr.totree(axis._parsed))

        if isinstance(axis, histbook.axis.groupby):
            indexes.append(exprcol)

        elif isinstance(axis, histbook.axis.groupbin):
            scaled = (exprcol - float(axis.origin)) * (1.0 /
                                                       float(axis.binwidth))
            if axis.closedlow:
                discretized = fcns.floor(scaled)
            else:
                discretized = fcns.ceil(scaled) - 1
            indexes.append(
                fcns.nanvl(
                    discretized * float(axis.binwidth) + float(axis.origin),
                    fcns.lit("NaN")))

        elif isinstance(axis, histbook.axis.bin):
            scaled = (exprcol -
                      float(axis.low)) * (int(axis.numbins) /
                                          (float(axis.high) - float(axis.low)))
            if axis.closedlow:
                discretized = fcns.floor(scaled) + 1
            else:
                discretized = fcns.ceil(scaled)
            indexes.append(
                fcns.when(
                    fcns.isnull(exprcol) | fcns.isnan(exprcol),
                    int(axis.numbins) + 2).otherwise(
                        fcns.greatest(
                            fcns.lit(0),
                            fcns.least(fcns.lit(int(axis.numbins) + 1),
                                       discretized))))

        elif isinstance(axis, histbook.axis.intbin):
            indexes.append(
                fcns.greatest(
                    fcns.lit(0),
                    fcns.least(fcns.lit(int(axis.max) - int(axis.min) + 1),
                               fcns.round(exprcol - int(axis.min) + 1))))

        elif isinstance(axis, histbook.axis.split):

            def build(x, i):
                if i < len(axis.edges):
                    if axis.closedlow:
                        return build(x.when(exprcol < float(axis.edges[i]), i),
                                     i + 1)
                    else:
                        return build(
                            x.when(exprcol <= float(axis.edges[i]), i), i + 1)
                else:
                    return x.otherwise(i)

            indexes.append(
                build(
                    fcns.when(
                        fcns.isnull(exprcol) | fcns.isnan(exprcol),
                        len(axis.edges) + 1), 0))

        elif isinstance(axis, histbook.axis.cut):
            indexes.append(fcns.when(exprcol, 0).otherwise(1))

        else:
            raise AssertionError(axis)

    aliasnum = [-1]

    def alias(x):
        aliasnum[0] += 1
        return x.alias("@" + str(aliasnum[0]))

    index = alias(fcns.struct(*indexes))

    selectcols = [index]
    if hist._weightoriginal is not None:
        weightcol = tocolumns(df, histbook.instr.totree(hist._weightparsed))
    for axis in hist._profile:
        exprcol = tocolumns(df, histbook.instr.totree(axis._parsed))
        if hist._weightoriginal is None:
            selectcols.append(alias(exprcol))
            selectcols.append(alias(exprcol * exprcol))
        else:
            selectcols.append(alias(exprcol * weightcol))
            selectcols.append(alias(exprcol * exprcol * weightcol))

    if hist._weightoriginal is None:
        df2 = df.select(*selectcols)
    else:
        selectcols.append(alias(weightcol))
        selectcols.append(alias(weightcol * weightcol))
        df2 = df.select(*selectcols)

    aggs = [fcns.sum(df2[n]) for n in df2.columns[1:]]
    if hist._weightoriginal is None:
        aggs.append(fcns.count(df2[df2.columns[0]]))

    def getornew(content, key, nextaxis):
        if key in content:
            return content[key]
        elif isinstance(nextaxis, histbook.axis.GroupAxis):
            return {}
        else:
            return numpy.zeros(hist._shape, dtype=histbook.hist.COUNTTYPE)

    def recurse(index, columns, axis, content):
        if len(axis) == 0:
            content += columns

        elif isinstance(axis[0],
                        (histbook.axis.groupby, histbook.axis.groupbin)):
            content[index[0]] = recurse(
                index[1:], columns, axis[1:],
                getornew(content, index[0],
                         axis[1] if len(axis) > 1 else None))
            if isinstance(axis[0], histbook.axis.groupbin) and None in content:
                content["NaN"] = content[None]
                del content[None]

        elif isinstance(
                axis[0],
            (histbook.axis.bin, histbook.axis.intbin, histbook.axis.split)):
            i = index[0] - (1 if not axis[0].underflow else 0)
            if int(i) < axis[0].totbins:
                recurse(index[1:], columns, axis[1:], content[int(i)])

        elif isinstance(axis[0], histbook.axis.cut):
            recurse(index[1:], columns, axis[1:],
                    content[0 if index[0] else 1])

        else:
            raise AssertionError(axis[0])

        return content

    query = df2.groupBy(df2[df2.columns[0]]).agg(*aggs)

    def wait():
        for row in query.collect():
            recurse(row[0], row[1:], hist._group + hist._fixed, hist._content)

    return wait
Example #27
def main():

    ## Building Spark Session and Reading Data into DataFrames
    spark = SparkSession.builder.appName("InterviewAnswers").getOrCreate()
    sc = spark.sparkContext
    # sc.addPyFile("dependencies.zip")
    df = spark.read.format('csv').options(
        header='true', inferSchema='true').load('/tmp/data/DataSample.csv')
    poi_t = spark.read.format('csv').options(
        header='true', inferSchema='true').load('/tmp/data/POIList.csv')

    # Question 1:
    # Drop the duplicated rows in both CSV files: the sample data (duplicates in timestamp/Latitude/Longitude) and the point-of-interest data (duplicates in Latitude/Longitude).
    poi = poi_t.dropDuplicates([' Latitude', 'Longitude'])
    df1 = df.dropDuplicates([' TimeSt', 'Latitude', 'Longitude'])
    poi = poi.select(
        col(" Latitude").alias("poi_lat"),
        col("Longitude").alias("poi_long"),
        col("POIID").alias("POIID"))

    # Question 2:

    def calculate(long, poi_long, lat, poi_lat):
        """
    Args: 
    An object containing longitude of a data sample
    An object containing the longitude of the point of interest
    An object containing latitude of a data sample
    An object containing the latitude of the point of interest
    
    returns:
    the distance from the data sample to the point of interest
    """

        long_data = float(long)
        long_poi = float(poi_long)
        lat_data = float(lat)
        lat_poi = float(poi_lat)
        lat_diff = (lat_data - lat_poi) * (lat_data - lat_poi)
        long_diff = (long_data - long_poi) * (long_data - long_poi)

        return math.sqrt(lat_diff + long_diff)

    # Question 2
    # Cross-join the point-of-interest locations onto the data samples and compute
    # the distance from each sample to every POI.
    df_poi = df1.crossJoin(poi)
    calculate_udf = udf(calculate)
    df_poi = df_poi.select(
        "*",
        calculate_udf("Longitude", "poi_long", "Latitude",
                      "poi_lat").alias('Distance'))
    df_poi = df_poi.select("*", df_poi.Distance.cast('float').alias('Dist'))
    dd = df_poi.groupby('_ID', 'Latitude', 'Longitude', 'Country', 'Province',
                        'City', ' TimeSt').pivot('POIID').min("Dist")
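    # Aside (illustrative sketch, not used below): the same planar distance built
    # from native column functions instead of a Python UDF, which avoids
    # serializing every row through Python.
    from pyspark.sql import functions as F
    df_poi_native = df_poi.withColumn(
        "dist_native",
        F.sqrt(F.pow(F.col("Latitude") - F.col("poi_lat"), 2) +
               F.pow(F.col("Longitude") - F.col("poi_long"), 2)))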

    def get_list(rows):
        """Flatten a list of Row objects into a flat list of their values.

        Args:
            rows: a list of pyspark.sql.Row objects (e.g. the result of .collect())

        Returns:
            A list containing every value from every row, in order.
        """
        values = []
        for row in rows:
            for value in row.asDict().values():
                values.append(value)
        return values

    POIs = poi.sort("POIID").select("POIID").distinct().collect()
    points = get_list(POIs)

    dd3 = dd.select("*", least(*points).alias('min'))
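    # Note: least() takes the row-wise minimum of its column arguments and skips
    # nulls, so the 'min' column above holds the distance to the nearest POI.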

    def find_index(*args):
        """Return the index of the POI whose distance equals the row minimum.

        The last argument is the 'min' column; the preceding arguments are the
        per-POI distances, in the same order as `points`.
        """
        point = len(args)
        minimum = float(args[-1])
        for i in range(len(args) - 1):
            if float(args[i]) == minimum:
                point = i
        return point

    def find_poi(index, poi_list):
        a = int(index)
        return poi_list[a]

    def udf_find_poi(label_list):
        # Use the passed-in label list rather than relying on the enclosing scope.
        return udf(lambda idx: find_poi(idx, label_list))

    poi_list = points

    aa = copy.deepcopy(points)
    aa.append("min")

    find_index_udf = udf(find_index)
    d_poi_temp = dd3.select("*", find_index_udf(*aa).alias('Index'))

    # column POINT of dataframe d_poi shows the nearest point of interest to the data point
    d_poi = d_poi_temp.select(
        "*",
        udf_find_poi(poi_list)(col("index")).alias("POINT"))

    ##Question 3 - Part 1:

    d_mean = d_poi.groupby("POINT").mean().sort("POINT")
    means = d_mean.select("avg(min)").collect()
    m = get_list(means)
    radius = d_poi.groupby("POINT").max().sort("POINT")
    counts = d_poi.groupby("POINT").count().sort("POINT")

    rr = radius.select("max(min)").collect()
    r = get_list(rr)

    stds = d_poi.groupby("POINT").agg(stddev("min")).sort("POINT")
    st = stds.select("stddev_samp(min)").collect()
    std = get_list(st)
    q4 = d_poi.join(stds, on="POINT", how="left")
    # q4_2 = q4.join(d_mean, on="POINT", how="left")

    poi_lat = get_list(poi.sort("POIID").select("poi_lat").collect())
    poi_long = get_list(poi.sort("POIID").select("poi_long").collect())

    colors = ['r', 'blue', 'g']
    # marker=['+', '.', '<']
    z = [5, 10, 15]
    theta = range(1, int(20000 * math.pi), 1)
    for i in range(len(std)):
        poi_plot = d_poi.where(col("POINT") == points[i])
        y = [val.Latitude for val in poi_plot.select('Latitude').collect()]
        x = [val.Longitude for val in poi_plot.select('Longitude').collect()]
        plt.plot(x, y, '.', markersize=1, color=colors[i])
        plt.plot(poi_long[i], poi_lat[i], 's', markersize=5, color=colors[i])
        # Bounding circle of radius r[i] (the maximum distance to this POI).
        a = [math.cos(x / 20000) * r[i] + poi_long[i] for x in theta]
        b = [math.sin(x / 20000) * r[i] + poi_lat[i] for x in theta]
        c = [-1 * math.sin(x / 20000) * r[i] + poi_lat[i] for x in theta]
        plt.plot(a, b, color=colors[i])
        plt.plot(a, c, color=colors[i])
        plt.grid()
        plt.xlabel("Longitude")
        plt.ylabel("Latitude")
        plt.title(points[i])
        plt.savefig('/tmp/data/poi_density_{}'.format(points[i]))
        plt.cla()
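        # (Sketch) The bounding circle could equivalently be drawn with numpy,
        # assuming numpy is imported as np:
        #     theta_np = np.linspace(0, 2 * np.pi, 500)
        #     plt.plot(poi_long[i] + r[i] * np.cos(theta_np),
        #              poi_lat[i] + r[i] * np.sin(theta_np), color=colors[i])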

    ## Question 3 - Part 2
    ## finding density
    requests = d_poi.groupby("POINT").count().sort("POINT")

    def find_area(radius):
        r = float(radius)
        area = 3.14159 * r**2
        return area

    find_area_udf = udf(find_area)
    areas = radius.select("*", find_area_udf("max(min)").alias('area'))
    den = areas.join(counts, on="POINT", how="left")

    def find_density(area, count):
        a = float(area)
        c = float(count)
        den = c / a
        return den

    density_udf = udf(find_density)
    dense = den.select("*", density_udf("area", "count").alias("density"))
    density_poi = dense.select("density").collect()
    d = get_list(density_poi)
    densities = {}

    for i in range(len(d)):
        densities[points[i]] = d[i]

    print('Request densities: ', densities)
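    # Sketch (equivalent, UDF-free, not used below): density = count / (pi * r^2)
    # per POI, computed directly with column expressions.
    from pyspark.sql import functions as F
    density_native = (radius.join(counts, on="POINT", how="left")
                      .withColumn("density",
                                  F.col("count") / (math.pi * F.pow(F.col("max(min)"), 2))))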

    ## Question 4
    q_outlier = stds.join(d_mean, on="POINT", how="left")
    q_outlier = q_outlier.select(
        col("POINT").alias("POINT"),
        col("stddev_samp(min)").alias("stddev"),
        col("avg(min)").alias("mean"))

    q4 = d_poi.join(q_outlier, on="POINT", how="left")

    poi = poi.select(
        col("poi_lat").alias("poi_lat"),
        col("poi_long").alias("poi_long"),
        col("POIID").alias("POINT"))
    q4 = q4.join(poi, on="POINT", how="left")

    q4 = q4.select(
        col("POINT").alias("POINT"),
        col("_ID").alias("ID"),
        col("poi_lat").alias("poi_lat"),
        col("poi_long").alias("poi_long"),
        col("Latitude").alias("Latitude"),
        col("Longitude").alias("Longitude"),
        col("Country").alias("Country"),
        col("Province").alias("Provice"),
        col("City").alias("City"),
        col(" TimeSt").alias("TimeSt"),
        col("stddev").alias("stddev"),
        col("min").alias("distance"),
        col("mean").alias("mean"))

    def outlier(dist, stddev, mean):
        # Flag points farther than two standard deviations from the mean distance.
        d = float(dist)
        std = float(stddev)
        m = float(mean)
        if (m - 2 * std) <= d <= (m + 2 * std):
            return 0
        return 1

    outlier_udf = udf(outlier)
    q4_outlier = q4.select(
        "*",
        outlier_udf("distance", "stddev", "mean").alias("outlier"))

    d_non_outlier = q4_outlier.where(col("outlier") == 0)

    print("There were {0} datas before removing outliers "
          "and after removing outliers there are: {1} datas".format(
              q4.count(), d_non_outlier.count()))
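    # Sketch (same 2-sigma rule expressed without a UDF, not used below):
    from pyspark.sql import functions as F
    d_non_outlier_native = q4.where(
        F.abs(F.col("distance") - F.col("mean")) <= 2 * F.col("stddev"))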

    poi_min = d_non_outlier.groupby("POINT").min().sort("POINT")
    poi_min = poi_min.select(col("POINT"), col("min(Latitude)"),
                             col("min(Longitude)"))

    poi_max = d_non_outlier.groupby("POINT").max().sort("POINT")
    poi_max = poi_max.select(col("POINT"), col("max(Latitude)"),
                             col("max(Longitude)"))

    poi_scale_step1 = poi_min.join(poi_max, on="POINT", how="left")
    poi_scale_step2 = d_non_outlier.join(poi_scale_step1,
                                         on="POINT",
                                         how="left")

    def scaling_latitude(lat, min_lat, max_lat):
        lat = float(lat)
        min_lat = float(min_lat)
        max_lat = float(max_lat)
        scaled = (lat - min_lat) / (max_lat - min_lat) * (20) - 10
        return scaled

    def scaling_longitude(long, min_long, max_long):
        long = float(long)
        min_long = float(min_long)
        max_long = float(max_long)
        scaled = (long - min_long) / (max_long - min_long) * (20) - 10
        return scaled

    scaling_lat_udf = udf(scaling_latitude)
    scaled_q4 = poi_scale_step2.select(
        "*",
        scaling_lat_udf("Latitude", "min(Latitude)",
                        "max(Latitude)").alias('lat_scaled'))
    scaled_q4 = scaled_q4.select(
        "*",
        scaling_lat_udf("poi_lat", "min(Latitude)",
                        "max(Latitude)").alias("poi_lat_scaled"))

    scaling_long_udf = udf(scaling_longitude)
    scaled_q4 = scaled_q4.select(
        "*",
        scaling_long_udf("Longitude", "min(Longitude)",
                         "max(Longitude)").alias('long_scaled'))
    scaled_q4 = scaled_q4.select(
        "*",
        scaling_long_udf("poi_long", "min(Longitude)",
                         "max(Longitude)").alias("poi_long_scaled"))

    # plot the final scaled dataframe
    plt.cla()
    for i in range(len(points)):
        poi_plot_scaled = scaled_q4.where(col("POINT") == points[i])
        y = get_list(poi_plot_scaled.select('lat_scaled').collect())
        x = get_list(poi_plot_scaled.select('long_scaled').collect())
        plt.plot(x, y, '.', markersize=1, color=colors[i])
        plt.xlabel("Scaled Longitude")
        plt.ylabel("Scaled Latitude")
        plt.axis("image")
        plt.tick_params(axis='both',
                        left=False,
                        top=False,
                        right=False,
                        bottom=False,
                        labelleft=False,
                        labeltop=False,
                        labelright=False,
                        labelbottom=False)
        plt.title(points[i])
        plt.savefig("/tmp/data/scaled_data_{}.png".format(points[i]))
        plt.cla()

    print('Request POI densities: ', densities)
Ejemplo n.º 28
0
def tocolumns(df, expr):
    import pyspark.sql.functions as fcns

    if isinstance(expr, histbook.expr.Const):
        return fcns.lit(expr.value)

    elif isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)):
        return df[expr.value]

    elif isinstance(expr, histbook.expr.Call):
        if expr.fcn == "abs" or expr.fcn == "fabs":
            return fcns.abs(tocolumns(df, expr.args[0]))
        elif expr.fcn == "max" or expr.fcn == "fmax":
            return fcns.greatest(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "min" or expr.fcn == "fmin":
            return fcns.least(*[tocolumns(df, x) for x in expr.args])
        elif expr.fcn == "arccos":
            return fcns.acos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arccosh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arcsin":
            return fcns.asin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arcsinh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "arctan2":
            return fcns.atan2(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "arctan":
            return fcns.atan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "arctanh":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "ceil":
            return fcns.ceil(tocolumns(df, expr.args[0]))
        elif expr.fcn == "copysign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "cos":
            return fcns.cos(tocolumns(df, expr.args[0]))
        elif expr.fcn == "cosh":
            return fcns.cosh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "rad2deg":
            return tocolumns(df, expr.args[0]) * (180.0 / math.pi)
        elif expr.fcn == "erfc":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "erf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp":
            return fcns.exp(tocolumns(df, expr.args[0]))
        elif expr.fcn == "expm1":
            return fcns.expm1(tocolumns(df, expr.args[0]))
        elif expr.fcn == "factorial":
            return fcns.factorial(tocolumns(df, expr.args[0]))
        elif expr.fcn == "floor":
            return fcns.floor(tocolumns(df, expr.args[0]))
        elif expr.fcn == "fmod":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "gamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "hypot":
            return fcns.hypot(tocolumns(df, expr.args[0]),
                              tocolumns(df, expr.args[1]))
        elif expr.fcn == "isinf":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isnan":
            return fcns.isnan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "lgamma":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "log10":
            return fcns.log10(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log1p":
            return fcns.log1p(tocolumns(df, expr.args[0]))
        elif expr.fcn == "log":
            return fcns.log(tocolumns(df, expr.args[0]))
        elif expr.fcn == "pow":
            return fcns.pow(tocolumns(df, expr.args[0]),
                            tocolumns(df, expr.args[1]))
        elif expr.fcn == "deg2rad":
            return tocolumns(df, expr.args[0]) * (math.pi / 180.0)
        elif expr.fcn == "sinh":
            return fcns.sinh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sin":
            return fcns.sin(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sqrt":
            return fcns.sqrt(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tanh":
            return fcns.tanh(tocolumns(df, expr.args[0]))
        elif expr.fcn == "tan":
            return fcns.tan(tocolumns(df, expr.args[0]))
        elif expr.fcn == "trunc":
            raise NotImplementedError(
                expr.fcn)  # FIXME (fcns.trunc is for dates)
        elif expr.fcn == "xor":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "conjugate":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "exp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "heaviside":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "isfinite":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "left_shift" and isinstance(expr.args[1],
                                                     histbook.expr.Const):
            return fcns.shiftLeft(tocolumns(df, expr.args[0]),
                                  expr.args[1].value)
        elif expr.fcn == "log2":
            return fcns.log2(tocolumns(df, expr.args[0]))
        elif expr.fcn == "logaddexp2":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "logaddexp":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "mod" or expr.fcn == "fmod":
            return tocolumns(df, expr.args[0]) % tocolumns(df, expr.args[1])
        elif expr.fcn == "right_shift" and isinstance(expr.args[1],
                                                      histbook.expr.Const):
            return fcns.shiftRight(tocolumns(df, expr.args[0]),
                                   expr.args[1].value)
        elif expr.fcn == "rint":
            return fcns.rint(tocolumns(df, expr.args[0]))
        elif expr.fcn == "sign":
            raise NotImplementedError(expr.fcn)  # FIXME
        elif expr.fcn == "where":
            return fcns.when(tocolumns(df, expr.args[0]),
                             tocolumns(df, expr.args[1])).otherwise(
                                 tocolumns(df, expr.args[2]))
        elif expr.fcn == "numpy.equal":
            return tocolumns(df, expr.args[0]) == tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.not_equal":
            return tocolumns(df, expr.args[0]) != tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less":
            return tocolumns(df, expr.args[0]) < tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.less_equal":
            return tocolumns(df, expr.args[0]) <= tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.isin":
            return tocolumns(df, expr.args[0]) in tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_not":
            return ~tocolumns(df, expr.args[0])
        elif expr.fcn == "numpy.add":
            return tocolumns(df, expr.args[0]) + tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.subtract":
            return tocolumns(df, expr.args[0]) - tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.multiply":
            return tocolumns(df, expr.args[0]) * tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.true_divide":
            return tocolumns(df, expr.args[0]) / tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_or":
            return tocolumns(df, expr.args[0]) | tocolumns(df, expr.args[1])
        elif expr.fcn == "numpy.logical_and":
            return tocolumns(df, expr.args[0]) & tocolumns(df, expr.args[1])
        else:
            raise NotImplementedError(expr.fcn)

    else:
        raise AssertionError(expr)
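# For reference, the numpy-to-Spark translations above follow patterns like
# numpy.where(cond, a, b) -> fcns.when(cond, a).otherwise(b). A toy check
# (assumes an existing SparkSession `spark`):
#
#     demo = spark.createDataFrame([(1.0,), (-2.0,)], ["x"])
#     demo.select(fcns.when(demo.x > 0, fcns.log(demo.x))
#                     .otherwise(fcns.lit(float("nan"))).alias("safe_log"))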
Ejemplo n.º 29
0
    def main(self, sc: SparkContext, *args: Any):
        """
        Takes in the stats results parquet, joins it with the ontology data, and returns the significant genotype-phenotype associations.
        """
        stats_results_parquet_path = args[0]
        ontology_parquet_path = args[1]
        output_path = args[2]

        spark = SparkSession.builder.getOrCreate()
        stats_results_df = spark.read.parquet(stats_results_parquet_path)
        ontology_df = spark.read.parquet(ontology_parquet_path)

        genotype_phenotype_df = stats_results_df.where(
            col("significant") == True).select(GENOTYPE_PHENOTYPE_COLUMNS +
                                               STATS_RESULTS_COLUMNS)
        genotype_phenotype_df = genotype_phenotype_df.withColumn(
            "mp_term", explode_outer("full_mp_term"))
        genotype_phenotype_df = genotype_phenotype_df.withColumn(
            "sex", col("mp_term.sex"))
        genotype_phenotype_df = genotype_phenotype_df.withColumn(
            "mp_term_id", col("mp_term.term_id"))
        genotype_phenotype_df = genotype_phenotype_df.withColumn(
            "mp_term_id", regexp_replace("mp_term_id", " ", ""))
        for bad_mp in BAD_MP_MAP.keys():
            genotype_phenotype_df = genotype_phenotype_df.withColumn(
                "mp_term_id",
                when(col("mp_term_id") == bad_mp,
                     lit(BAD_MP_MAP[bad_mp])).otherwise(col("mp_term_id")),
            )

        genotype_phenotype_df = genotype_phenotype_df.join(
            ontology_df,
            col("mp_term_id") == col("id"), "left_outer")

        for column_name, ontology_column in ONTOLOGY_STATS_MAP.items():
            genotype_phenotype_df = genotype_phenotype_df.withColumn(
                f"{column_name}", col(ontology_column))

        genotype_phenotype_df = genotype_phenotype_df.withColumn(
            "p_value",
            when(
                col("statistical_method").isin(["Manual", "Supplied as data"])
                | (col("resource_name") == "pwg"),
                col("p_value"),
            ).when(
                col("statistical_method").contains("Reference Range Plus"),
                when(
                    col("sex") == "male",
                    least(
                        col("male_pvalue_low_vs_normal_high"),
                        col("male_pvalue_low_normal_vs_high"),
                    ),
                ).when(
                    col("sex") == "female",
                    least(
                        col("female_pvalue_low_vs_normal_high"),
                        col("female_pvalue_low_normal_vs_high"),
                    ),
                ).otherwise(
                    least(
                        col("genotype_pvalue_low_normal_vs_high"),
                        col("genotype_pvalue_low_vs_normal_high"),
                    )),
            ).otherwise(
                when(col("sex") == "male", col("male_ko_effect_p_value")).when(
                    col("sex") == "female",
                    col("female_ko_effect_p_value")).otherwise(
                        when(
                            col("statistical_method").contains(
                                "Fisher Exact Test framework"),
                            col("p_value"),
                        ).otherwise(col("genotype_effect_p_value")))),
        )
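        # Note: least() skips nulls and returns the smallest non-null value per row,
        # so a missing one-sided p-value does not null out the combined p_value.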

        genotype_phenotype_df = genotype_phenotype_df.withColumn(
            "effect_size",
            when(
                col("statistical_method").isin(["Manual", "Supplied as data"]),
                lit(1.0)).
            when(col("resource_name") == "pwg", col("effect_size")).when(
                ~col("statistical_method").contains("Reference Range Plus"),
                when(col("sex") == "male", col("male_effect_size")).when(
                    col("sex") == "female",
                    col("female_effect_size")).otherwise(col("effect_size")),
            ).otherwise(
                when(
                    col("sex") == "male",
                    when(
                        col("male_effect_size_low_vs_normal_high") <=
                        col("male_effect_size_low_normal_vs_high"),
                        col("genotype_effect_size_low_vs_normal_high"),
                    ).otherwise(
                        col("genotype_effect_size_low_normal_vs_high")),
                ).when(
                    col("sex") == "female",
                    when(
                        col("female_effect_size_low_vs_normal_high") <=
                        col("female_effect_size_low_normal_vs_high"),
                        col("genotype_effect_size_low_vs_normal_high"),
                    ).otherwise(
                        col("genotype_effect_size_low_normal_vs_high")),
                ).otherwise(col("effect_size"))),
        )

        genotype_phenotype_df = genotype_phenotype_df.withColumn(
            "percentage_change",
            when(col("sex") == "male", col("male_percentage_change")).when(
                col("sex") == "female",
                col("female_percentage_change")).otherwise(
                    col("percentage_change")),
        )

        genotype_phenotype_df = genotype_phenotype_df.withColumn(
            "assertion_type_id",
            when(
                col("statistical_method").isin(["Manual", "Supplied as data"]),
                lit("ECO:0000218"),
            ).otherwise(lit("ECO:0000203")),
        )

        genotype_phenotype_df = genotype_phenotype_df.withColumn(
            "assertion_type",
            when(
                col("statistical_method").isin(["Manual", "Supplied as data"]),
                lit("manual"),
            ).otherwise(lit("automatic")),
        )

        genotype_phenotype_df = genotype_phenotype_df.select(
            GENOTYPE_PHENOTYPE_COLUMNS + list(ONTOLOGY_STATS_MAP.keys()) +
            ["assertion_type_id", "assertion_type"])
        genotype_phenotype_df = genotype_phenotype_df.withColumn(
            "doc_id",
            monotonically_increasing_id().astype(StringType()))
        ontology_field_prefixes = ["mpath_", "anatomy_"]
        for prefix in ontology_field_prefixes:
            for col_name in genotype_phenotype_df.columns:
                if prefix in col_name:
                    genotype_phenotype_df = genotype_phenotype_df.withColumn(
                        col_name,
                        when(
                            col(col_name).isNotNull(),
                            col(col_name.replace(prefix, "mp_")),
                        ).otherwise(col(col_name)),
                    )
        genotype_phenotype_df.distinct().write.parquet(output_path)
def my_kmeans(dataset, k, output_label):
    # k-means on a two-column CSV: farthest-point seeding followed by Lloyd iterations.
    # Relies on helpers defined elsewhere in this notebook (squaree_spark1,
    # initial_centroids, UpdateCentroid, find_min_val_name) and on an existing
    # SparkSession `spark`.
    data_spark_df01 = spark.read.format('csv').option('header', 'True').option('index', 'True').load(dataset)
    new_name = ['first', 'second']
    data_spark_df0 = data_spark_df01.toDF(*new_name)

    data_spark_df1 = data_spark_df0.withColumn("first_numeric", data_spark_df0["first"].cast(FloatType()))
    data_spark_df2 = data_spark_df1.withColumn("second_numeric", data_spark_df1["second"].cast(FloatType()))
    data_spark_df = data_spark_df2.drop('first').drop('second')

    # Pick one sampled point as the first centroid.
    df_centroid = data_spark_df.sample(False, 0.1, seed=0)
    df_centroid_cache = df_centroid.limit(1).cache()
    df_centroid_cache.show()

    new_name = ['x', 'y']
    df_centroid_cache = df_centroid_cache.toDF(*new_name)
    
    i = 0

    # Distance of every point to the first centroid.
    data_cent = data_spark_df.join(broadcast(df_centroid_cache))
    data_cent = data_cent.withColumn(str(i), squaree_spark1(data_cent.columns[0], data_cent.columns[1],
                                                            data_cent.columns[2 * i + 2], data_cent.columns[2 * i + 3]))

    data_cent = data_cent.drop(data_cent.columns[i + 2]).drop(data_cent.columns[i + 3])

    data_cent3 = data_cent.withColumn('mindist', col(str(i)))
    data_cent4 = data_cent3.withColumn('mindist1', least(data_cent3.columns[i + 2], col('mindist'))).drop('mindist')
    data_cent5 = data_cent4.withColumnRenamed('mindist1', 'mindist')

    # The point farthest from the current centroids becomes the next centroid.
    next_selected_cache = data_cent5.orderBy(desc('mindist')).limit(1).cache()
    next_selected = next_selected_cache.select(data_cent5.columns[0:2])

    u = [str(i) + 'x', str(i) + 'y']
    next_selected = next_selected.toDF(*u)
    data_cent5.explain()
    
    
    start = timer()
    # Farthest-point seeding: select the remaining k - 1 initial centroids one at a time.
    for i in range(1, k):
        print(i)

        next_selected_take = next_selected.repartition(2001).cache()
        next_selected_take.take(1)

        if i == k - 1:
            global data_cent11
            data_cent11 = initial_centroids(next_selected_take, data_cent5, i)
        else:
            next_selected_take, data_cent5 = initial_centroids(next_selected_take, data_cent5, i)

            u = [str(i) + 'x', str(i) + 'y']
            next_selected_take = next_selected_take.toDF(*u)
            next_selected = next_selected_take

    end = timer()
    print("Execution time HH:MM:SS:", timedelta(seconds=end - start))
    
    # Assign every point to its nearest seeded centroid, then average per cluster.
    data_cent14 = data_cent11.withColumn('defined_cluster', find_min_val_name(*data_cent11.columns[2:3 + k]))
    data_cent16 = data_cent14.select('first_numeric', 'second_numeric', 'defined_cluster')

    next_cent17 = data_cent16.repartition(k, 'defined_cluster')
    new_centroid = next_cent17.groupBy('defined_cluster').avg('first_numeric', 'second_numeric')
    
    start = timer()
    # Lloyd iterations: recompute assignments and centroids for a fixed number of passes.
    for i in range(20):
        print(i)
        new_centroid_cache_take = new_centroid.repartition(2001).cache()
        new_centroid_cache_take.take(1)

        new_centroid_cache_take, final_data = UpdateCentroid(data_spark_df, new_centroid_cache_take, k)
        new_centroid = new_centroid_cache_take

    end = timer()
    print("Execution time HH:MM:SS:", timedelta(seconds=end - start))
    
    final_data1 = final_data.select('defined_cluster')
    final_list = final_data1.toPandas()
    final_label = np.array(list(final_list['defined_cluster']))

    # cost_function(output_label, final_label)

    return final_data
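# For comparison, a minimal sketch of the same clustering with Spark ML's built-in
# KMeans (illustrative only; assumes a DataFrame with the two numeric feature
# columns produced inside my_kmeans and an existing SparkSession):
#
#     from pyspark.ml.feature import VectorAssembler
#     from pyspark.ml.clustering import KMeans
#
#     assembler = VectorAssembler(inputCols=["first_numeric", "second_numeric"],
#                                 outputCol="features")
#     features_df = assembler.transform(data_spark_df)
#     model = KMeans(k=3, seed=0).fit(features_df)
#     clustered = model.transform(features_df)  # adds a 'prediction' cluster column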