from pyspark.sql import functions as F


def scd_analyze(df, merge_on=None, state_col='_state', updated_col='_updated'):
    add_ids = '##add_ids'
    del_ids = '##del_ids'
    upd_ids = '##upd_ids'

    # Data columns are everything except the SCD bookkeeping columns.
    c = set(df.columns).difference({state_col, updated_col})
    colnames = [x for x in df.columns if x in c]

    # Normalise the merge key: default to all data columns, accept a single
    # name or a list, and keep only names that actually exist in the frame.
    on = merge_on or colnames
    on = on if isinstance(on, (list, tuple)) else [on]
    on = [x for x in on if x in colnames]

    s = on + [state_col, updated_col]
    cols = [x for x in df.columns if x not in s]

    # Keys added (state 0) and deleted (state 1), grouped per update timestamp.
    a = df.filter(f'{state_col} = 0') \
        .groupby(updated_col) \
        .agg(F.collect_set(F.concat(*on)).alias(add_ids)) \
        .select(updated_col, add_ids)
    d = df.filter(f'{state_col} = 1') \
        .groupby(updated_col) \
        .agg(F.collect_set(F.concat(*on)).alias(del_ids)) \
        .select(updated_col, del_ids)

    res = a.join(d, on=updated_col, how='outer')
    res = res.select(
        updated_col,
        F.coalesce(add_ids, F.array([])).alias(add_ids),
        F.coalesce(del_ids, F.array([])).alias(del_ids))

    if cols:
        # Per key and update, count how many distinct values each non-key
        # column took, minus one (i.e. the number of changes).
        agg_funcs = [(F.countDistinct(x) - F.lit(1)).alias(x) for x in cols]
        cnt = df.groupby(*on, updated_col).agg(*agg_funcs)
        agg_names = [F.lit(x) for x in cols]
        agg_sums = [F.sum(x) for x in cols]
        cnt = cnt.groupby(updated_col).agg(
            F.map_from_arrays(F.array(*agg_names), F.array(*agg_sums)).alias('changes'))
        res = res.join(cnt, on=updated_col)
    else:
        res = res.withColumn('changes', F.lit(None))

    # Keys present in both the added and deleted sets were updated.
    res = res.select('*', F.array_intersect(add_ids, del_ids).alias(upd_ids))
    res = res.select(
        F.col(updated_col).alias('updated'),
        F.size(upd_ids).alias('upd'),
        F.size(F.array_except(add_ids, upd_ids)).alias('add'),
        F.size(F.array_except(del_ids, upd_ids)).alias('del'),
        'changes')

    return res.orderBy('updated')
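# Usage sketch for scd_analyze (hypothetical data; assumes an existing
# SparkSession named `spark`). As interpreted by the function above, `_state`
# 0 marks rows that were added and 1 rows that were deleted at the `_updated`
# timestamp.
example_df = spark.createDataFrame(
    [
        ("k1", "blue", 0, "2021-01-01"),   # k1 added
        ("k1", "blue", 1, "2021-01-02"),   # old version of k1 removed ...
        ("k1", "red", 0, "2021-01-02"),    # ... and re-added with a new colour
        ("k2", "green", 1, "2021-01-02"),  # k2 deleted
    ],
    ["id", "colour", "_state", "_updated"],
)
scd_analyze(example_df, merge_on=["id"]).show()
# Expected: one row per `_updated` value with `add`, `del`, `upd` counts and a
# `changes` map, e.g. {"colour": 1} for 2021-01-02.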
def _get_significance_fields_by_gene(self, stats_results_df):
    # Class method; assumes `from pyspark.sql import functions` is in scope.
    significant_mp_term = stats_results_df.select(
        "gene_accession_id", "top_level_mp_term_name", "significant"
    )
    significant_mp_term = significant_mp_term.withColumn(
        "top_level_mp_term_name", functions.explode("top_level_mp_term_name")
    )
    # Collect two sets of top-level MP terms per gene: terms coming from
    # significant results and terms coming from non-significant results.
    # collect_set drops the nulls produced by the otherwise(lit(None)) branch.
    significant_mp_term = significant_mp_term.groupBy("gene_accession_id").agg(
        functions.collect_set(
            functions.when(
                functions.col("significant") == True,
                functions.col("top_level_mp_term_name"),
            ).otherwise(functions.lit(None))
        ).alias("significant_top_level_mp_terms"),
        functions.collect_set(
            functions.when(
                functions.col("significant") == False,
                functions.col("top_level_mp_term_name"),
            ).otherwise(functions.lit(None))
        ).alias("not_significant_top_level_mp_terms"),
    )
    # A term that is significant for any result of a gene should not also be
    # reported as not significant for that gene.
    significant_mp_term = significant_mp_term.withColumn(
        "not_significant_top_level_mp_terms",
        functions.array_except(
            "not_significant_top_level_mp_terms", "significant_top_level_mp_terms"
        ),
    )
    significant_mp_term = significant_mp_term.withColumnRenamed(
        "gene_accession_id", "mgi_accession_id"
    )
    return significant_mp_term
def _init_csv_df(self) -> DataFrame:
    NULL_FIELD_EXCLUDE_ARRAY = F.array(
        F.struct(
            F.lit("").alias("field"),
            F.lit("").alias("value"),
            F.lit("").alias("units"),
        ),
        F.struct(
            F.lit(None).alias("field"),
            F.lit(None).alias("value"),
            F.lit(None).alias("units"),
        ),
    )
    assert self._process_fit2csv()
    base_df = (
        self._spark.read.csv(f"{self.csv_path}", header=True)
        .withColumnRenamed("Type", "record_type")
        .where(F.col("record_type") == "Data")
    )
    all_fields = F.array(self._get_fit_field_structs(base_df)).alias("all_fields")
    fields = F.array_except(all_fields, NULL_FIELD_EXCLUDE_ARRAY).alias("fields")
    return base_df.select(
        *[F.col(col).alias(col.lower()) for col in RECORD_ATTRS], fields
    )
from pyspark.sql import DataFrame
from pyspark.sql import functions as F


def to_user_product_pairs(products_by_customer_df: DataFrame,
                          indexed_user_items: DataFrame) -> DataFrame:
    """
    Create positive and negative product pairs per customer.

    Args:
        products_by_customer_df (DataFrame):
            +-----------+--------------------+
            |customer_id|            products|
            +-----------+--------------------+
            |   10007421|[12537, 27265, 32...|
            |   10036418|[46420, 34635, 27...|
            |   10058663|[42536, 42984, 37...|
            |   10083340|[34617, 45656, 42...|
            |   10108034|[43418, 46650, 42...|
            +-----------+--------------------+
        indexed_user_items (DataFrame): product id and product index mapping.

    Returns:
        DataFrame:
            root
             |-- <partition_column> customer_id: string (nullable = true)
             |-- positives: array (nullable = true)
             |    |-- element: integer (containsNull = false)
    """
    # Collect the full set of product indices once and broadcast it, so each
    # customer's positives can be diffed against the whole catalogue.
    products_collection = indexed_user_items.agg(
        F.collect_set("product_id_index").alias("full_product_index"),
    )
    positive_and_neg = (
        products_by_customer_df.crossJoin(F.broadcast(products_collection))
        .withColumn(
            "negative_ids",
            F.array_except("full_product_index", "positives_ids"),
        )
        .select(
            F.col("customer_id"),
            F.col("customer_id_index"),
            F.col("cross_bin_number"),
            F.col("positives_ids").alias("positives"),
            # Sample as many negatives as there are positives per customer.
            F.expr("slice(shuffle(negative_ids), 1, size(positives_ids))").alias("negatives"),
        )
    )
    return positive_and_neg
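# Usage sketch for to_user_product_pairs (hypothetical data; assumes an
# existing SparkSession named `spark`). The input needs `positives_ids`,
# `customer_id_index` and `cross_bin_number` columns in addition to the
# `customer_id` column shown in the docstring.
users_df = spark.createDataFrame(
    [("10007421", 0, 0, [12537, 27265]), ("10036418", 1, 0, [46420])],
    ["customer_id", "customer_id_index", "cross_bin_number", "positives_ids"],
)
catalogue_df = spark.createDataFrame(
    [(12537,), (27265,), (46420,), (34635,), (42536,)],
    ["product_id_index"],
)
pairs_df = to_user_product_pairs(users_df, catalogue_df)
# Each row now carries `positives` plus an equally sized random sample of
# `negatives` drawn from the catalogue minus the customer's positives.
pairs_df.show(truncate=False)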
def to_train_triplet(train_df, test_df, products_index):
    # Positives are the union of a customer's train and test products.
    positives_df = train_df.join(
        test_df.select("customer_id", F.col("products").alias("tests")),
        on=["customer_id"],
    ).select(
        "customer_id",
        "products",
        F.concat("products", "tests").alias("positives"),
    )
    index_set = products_index.select(
        F.collect_set("product_id_index").alias("idx_set"))
    # Negatives are drawn from the full index minus the positives, shuffled
    # and truncated to the same length as the products list.
    train_triplet = (
        positives_df.crossJoin(index_set)
        .withColumn(
            "all_negatives",
            F.shuffle(F.array_except(F.col("idx_set"), F.col("positives"))),
        )
        .select(
            F.col("customer_id"),
            F.col("products"),
            F.expr("slice(all_negatives, 1, size(products))").alias("negatives"),
        )
    )
    return train_triplet
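# Usage sketch for to_train_triplet (hypothetical data; assumes an existing
# SparkSession named `spark`). Train and test products are concatenated into
# the positives set; negatives are sampled from the remaining index values.
train_df = spark.createDataFrame(
    [("c1", [1, 2]), ("c2", [3])], ["customer_id", "products"]
)
test_df = spark.createDataFrame(
    [("c1", [4]), ("c2", [5])], ["customer_id", "products"]
)
products_index = spark.createDataFrame(
    [(i,) for i in range(1, 11)], ["product_id_index"]
)
to_train_triplet(train_df, test_df, products_index).show(truncate=False)
# c1 keeps products [1, 2] and gets two negatives drawn from
# {3, 5, 6, 7, 8, 9, 10}, i.e. the full index minus its positives 1, 2 and 4.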
from pyspark.sql.functions import array, array_except, col, size


def get_timeseries(spark, dates, path_prefix='/user/s1840495/WebInsight/'):
    """
    Gets the initial pages and calculates diffs for every subsequent date,
    producing a per-page time series of its link-change behaviour.
    """
    print("Time-series using: " + str(dates) + ".")

    diffInternalOutLinkColumns = []
    diffExternalOutLinkColumns = []
    timeseries = None

    for date in dates:
        # CreatePath builds the parquet path for a (year, month, day) tuple.
        path = CreatePath(date[0], date[1], date[2], path_prefix)
        date_str = '{0}-{1:02d}-{2:02d}'.format(date[0], date[1], date[2])

        if timeseries is None:
            # This is the first date, so initialise the time-series with it.
            timeseries = spark.read.parquet(path) \
                .where(col('url').isNotNull()) \
                .where(col('fetch_contentLength') > 0) \
                .where(col('fetchMon').isNotNull()) \
                .where(col('fetch_semanticVector').isNotNull()) \
                .select(
                    'url',
                    'fetchMon',
                    'fetchDay',
                    col('internalOutLinks.targetUrl').alias('internalOutLinks'),
                    col('externalOutLinks.targetUrl').alias('externalOutLinks')
                ) \
                .where(col('fetchMon') == date[1]) \
                .where(col('fetchDay') == date[2]) \
                .drop('fetchMon') \
                .drop('fetchDay')
            timeseries.show()
            continue

        # We have one point in the future, so create a new data point.
        new_df = spark.read.parquet(path) \
            .where(col('fetchMon').isNotNull()) \
            .select(
                'url',
                'fetchMon',
                'fetchDay',
                col('internalOutLinks.targetUrl').alias('newInternalOutLinks'),
                col('externalOutLinks.targetUrl').alias('newExternalOutLinks')
            ) \
            .where(col('fetchMon') == date[1]) \
            .where(col('fetchDay') == date[2]) \
            .drop('fetchMon') \
            .drop('fetchDay')

        print("---: " + date_str + " :----")
        new_df.show()
        print("Starting join!")

        diffInternalOutLinkColumn = date_str + '_diffInternalOutLinks'
        diffInternalOutLinkColumns.append(diffInternalOutLinkColumn)
        diffExternalOutLinkColumn = date_str + '_diffExternalOutLinks'
        diffExternalOutLinkColumns.append(diffExternalOutLinkColumn)

        # Join the two dataframes on url. Note: the inner join keeps only pages
        # present in both snapshots; the commented-out variant below would
        # instead keep the old link information when the new snapshot has none.
        timeseries = timeseries.join(new_df, on='url', how='inner') \
            .withColumn(diffExternalOutLinkColumn,
                        size(array_except("newExternalOutLinks", "externalOutLinks"))) \
            .withColumn(diffInternalOutLinkColumn,
                        size(array_except("newInternalOutLinks", "internalOutLinks"))) \
            .withColumn('externalOutLinks', col('newExternalOutLinks')) \
            .drop('newExternalOutLinks') \
            .withColumn('internalOutLinks', col('newInternalOutLinks')) \
            .drop('newInternalOutLinks')
        # .withColumn('externalOutLinks', when(col('newExternalOutLinks').isNull(), col('externalOutLinks')).otherwise(col('newExternalOutLinks'))) \
        # .withColumn('internalOutLinks', when(col('newInternalOutLinks').isNull(), col('internalOutLinks')).otherwise(col('newInternalOutLinks'))) \
        # .drop('newExternalOutLinks') \
        # .drop('newInternalOutLinks')

        print("Timeseries count: " + str(timeseries.count()))
        timeseries.show()

    # Cleanup: merge the per-date diff columns into two arrays.
    timeseries = timeseries.select(
        'url',
        array(*diffInternalOutLinkColumns).alias("diffInternalOutLinks"),
        array(*diffExternalOutLinkColumns).alias("diffExternalOutLinks"))

    print("Finished creation of timeseries.")
    return timeseries
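# Usage sketch for get_timeseries (assumes WebInsight parquet snapshots are
# reachable through CreatePath under path_prefix; the (year, month, day)
# tuples below are hypothetical).
dates = [(2020, 3, 1), (2020, 3, 8), (2020, 3, 15)]
timeseries_df = get_timeseries(spark, dates)
# One row per url with two arrays, holding one diff value per follow-up
# snapshot: `diffInternalOutLinks` and `diffExternalOutLinks`.
timeseries_df.show(5, truncate=False)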
def _get_datasets_by_gene(
    self, stats_results_df, observations_df, ontology_metadata_df, compress=True
):
    # Class method; assumes `from pyspark.sql import functions`,
    # `from pyspark.sql.functions import array, array_except, col, lit`,
    # `from pyspark.sql.types import DoubleType, StringType` and `import json`
    # are in scope.
    significance_cols = [
        "female_ko_effect_p_value",
        "male_ko_effect_p_value",
        "genotype_effect_p_value",
        "male_pvalue_low_vs_normal_high",
        "male_pvalue_low_normal_vs_high",
        "female_pvalue_low_vs_normal_high",
        "female_pvalue_low_normal_vs_high",
        "genotype_pvalue_low_normal_vs_high",
        "genotype_pvalue_low_vs_normal_high",
        "p_value",
        "effect_size",
        "male_effect_size",
        "female_effect_size",
        "male_effect_size_low_vs_normal_high",
        "male_effect_size_low_normal_vs_high",
        "genotype_effect_size_low_vs_normal_high",
        "genotype_effect_size_low_normal_vs_high",
        "female_effect_size_low_vs_normal_high",
        "female_effect_size_low_normal_vs_high",
        "significant",
        "full_mp_term",
        "metadata_group",
        "male_mutant_count",
        "female_mutant_count",
        "statistical_method",
        "mp_term_id",
        "top_level_mp_term_id",
        "top_level_mp_term_name",
        "sex",
    ]
    data_set_cols = [
        "allele_accession_id",
        "allele_symbol",
        "gene_symbol",
        "gene_accession_id",
        "parameter_stable_id",
        "parameter_name",
        "procedure_stable_id",
        "procedure_name",
        "pipeline_name",
        "pipeline_stable_id",
        "zygosity",
        "phenotyping_center",
        "life_stage_name",
    ]
    stats_results_df = stats_results_df.select(*(data_set_cols + significance_cols))

    # Pick the p-value that matches the statistical method and the sex of the
    # data set.
    stats_results_df = stats_results_df.withColumn(
        "selected_p_value",
        functions.when(
            functions.col("statistical_method").isin(
                ["Manual", "Supplied as data"]
            ),
            functions.col("p_value"),
        )
        .when(
            functions.col("statistical_method").contains("Reference Range Plus"),
            functions.when(
                functions.col("sex") == "male",
                functions.least(
                    functions.col("male_pvalue_low_vs_normal_high"),
                    functions.col("male_pvalue_low_normal_vs_high"),
                ),
            )
            .when(
                functions.col("sex") == "female",
                functions.least(
                    functions.col("female_pvalue_low_vs_normal_high"),
                    functions.col("female_pvalue_low_normal_vs_high"),
                ),
            )
            .otherwise(
                functions.least(
                    functions.col("genotype_pvalue_low_normal_vs_high"),
                    functions.col("genotype_pvalue_low_vs_normal_high"),
                )
            ),
        )
        .otherwise(
            functions.when(
                functions.col("sex") == "male",
                functions.col("male_ko_effect_p_value"),
            )
            .when(
                functions.col("sex") == "female",
                functions.col("female_ko_effect_p_value"),
            )
            .otherwise(
                functions.when(
                    functions.col("statistical_method").contains(
                        "Fisher Exact Test framework"
                    ),
                    functions.col("p_value"),
                ).otherwise(functions.col("genotype_effect_p_value"))
            )
        ),
    )
    stats_results_df = stats_results_df.withColumn(
        "selected_p_value", functions.col("selected_p_value").cast(DoubleType())
    )

    # Pick the matching effect size in the same way.
    stats_results_df = stats_results_df.withColumn(
        "selected_effect_size",
        functions.when(
            functions.col("statistical_method").isin(
                ["Manual", "Supplied as data"]
            ),
            functions.lit(1.0),
        )
        .when(
            ~functions.col("statistical_method").contains("Reference Range Plus"),
            functions.when(
                functions.col("sex") == "male", functions.col("male_effect_size")
            )
            .when(
                functions.col("sex") == "female",
                functions.col("female_effect_size"),
            )
            .otherwise(functions.col("effect_size")),
        )
        .otherwise(
            functions.when(
                functions.col("sex") == "male",
                functions.when(
                    functions.col("male_effect_size_low_vs_normal_high")
                    <= functions.col("male_effect_size_low_normal_vs_high"),
                    functions.col("genotype_effect_size_low_vs_normal_high"),
                ).otherwise(
                    functions.col("genotype_effect_size_low_normal_vs_high")
                ),
            )
            .when(
                functions.col("sex") == "female",
                functions.when(
                    functions.col("female_effect_size_low_vs_normal_high")
                    <= functions.col("female_effect_size_low_normal_vs_high"),
                    functions.col("genotype_effect_size_low_vs_normal_high"),
                ).otherwise(
                    functions.col("genotype_effect_size_low_normal_vs_high")
                ),
            )
            .otherwise(functions.col("effect_size"))
        ),
    )
    stats_results_df = stats_results_df.withColumn(
        "selected_phenotype_term", functions.col("mp_term_id")
    )

    observations_df = observations_df.select(*data_set_cols).distinct()
    datasets_df = observations_df.join(
        stats_results_df, data_set_cols, "left_outer"
    )
    datasets_df = datasets_df.groupBy(data_set_cols).agg(
        functions.collect_set(
            functions.struct(
                *[
                    "selected_p_value",
                    "selected_effect_size",
                    "selected_phenotype_term",
                    "metadata_group",
                    "male_mutant_count",
                    "female_mutant_count",
                    "significant",
                    "top_level_mp_term_id",
                    "top_level_mp_term_name",
                ]
            )
        ).alias("stats_data")
    )
    # Prefer a stats result that carries a p-value; otherwise fall back to the
    # first entry of the collected set.
    datasets_df = datasets_df.withColumn(
        "successful_stats_data",
        functions.expr(
            "filter(stats_data, stat -> stat.selected_p_value IS NOT NULL)"
        ),
    )
    datasets_df = datasets_df.withColumn(
        "stats_data",
        functions.when(
            functions.size("successful_stats_data") > 0,
            functions.sort_array("successful_stats_data").getItem(0),
        ).otherwise(functions.sort_array("stats_data").getItem(0)),
    )
    datasets_df = datasets_df.select(*data_set_cols, "stats_data.*")
    datasets_df = datasets_df.withColumnRenamed("selected_p_value", "p_value")
    datasets_df = datasets_df.withColumnRenamed(
        "selected_effect_size", "effect_size"
    )
    datasets_df = datasets_df.withColumnRenamed(
        "selected_phenotype_term", "phenotype_term_id"
    )
    datasets_df = datasets_df.withColumnRenamed(
        "top_level_mp_term_id", "top_level_phenotype_term_id"
    )
    # Drop null entries from the array of top-level term names.
    datasets_df = datasets_df.withColumn(
        "top_level_mp_term_name",
        array_except(
            col("top_level_mp_term_name"), array(lit(None).cast("string"))
        ),
    )
    datasets_df = datasets_df.withColumnRenamed(
        "top_level_mp_term_name", "top_level_phenotype_term_name"
    )
    datasets_df = datasets_df.join(
        ontology_metadata_df, "phenotype_term_id", "left_outer"
    )
    datasets_df = datasets_df.withColumn(
        "significance",
        functions.when(
            functions.col("significant") == True, functions.lit("Significant")
        )
        .when(
            functions.col("p_value").isNotNull(), functions.lit("Not significant")
        )
        .otherwise(functions.lit("N/A")),
    )

    mgi_datasets_df = datasets_df.groupBy("gene_accession_id").agg(
        functions.collect_set(
            functions.struct(
                *(
                    data_set_cols
                    + [
                        "significance",
                        "p_value",
                        "effect_size",
                        "metadata_group",
                        "male_mutant_count",
                        "female_mutant_count",
                        "phenotype_term_id",
                        "phenotype_term_name",
                        "top_level_phenotype_term_id",
                        "top_level_phenotype_term_name",
                    ]
                )
            )
        ).alias("datasets_raw_data")
    )
    mgi_datasets_df = mgi_datasets_df.withColumnRenamed(
        "gene_accession_id", "mgi_accession_id"
    )

    if compress:
        # Serialise the raw data sets to JSON, then compress and encode them.
        to_json_udf = functions.udf(
            lambda row: None
            if row is None
            else json.dumps(
                [
                    {key: value for key, value in item.asDict().items()}
                    for item in row
                ]
            ),
            StringType(),
        )
        mgi_datasets_df = mgi_datasets_df.withColumn(
            "datasets_raw_data", to_json_udf("datasets_raw_data")
        )
        compress_and_encode = functions.udf(self._compress_and_encode, StringType())
        mgi_datasets_df = mgi_datasets_df.withColumn(
            "datasets_raw_data", compress_and_encode("datasets_raw_data")
        )
    return mgi_datasets_df