def generate_TFIDF(sc, df, sqlcontext):
    """Build a per-document, per-token TF-IDF table from ``df``.

    Args:
        sc: Unused; kept so existing callers keep working.
        df: Spark DataFrame with an ``_id`` column (document key) and a
            ``text_entry`` column (document text).
        sqlcontext: Unused; kept so existing callers keep working.

    Returns:
        A cached Spark DataFrame with the columns ``_id``, ``token``, ``tf``,
        ``df``, ``idf`` and ``tf_idf`` where ``idf = log10(N / df)``, ``N`` is
        the number of rows of the input and ``tf_idf = idf * tf``.
    """
    # 1. Number of rows (documents) in the input frame.
    t_num = df.count()

    # 2. Lower-case the text, strip punctuation and split it into tokens.
    #    Raw strings keep the regex escapes intact, and splitting on runs of
    #    whitespace avoids producing an empty token for every repeated blank.
    cleaned = F.lower(F.regexp_replace(F.col("text_entry"), r"[^\w\s]", ""))
    word_splits = df.select("_id", F.split(cleaned, r"\s+").alias("tokens"))

    # 3. One row per (_id, token) with the number of occurrences (tf).
    #    Leading/trailing whitespace can still yield "" tokens; drop them so
    #    the empty string is never scored as a term.
    words_tf = (
        word_splits
        .select("_id", F.explode("tokens").alias("token"))
        .filter(F.col("token") != "")
        .groupBy("_id", "token")
        .agg(F.count("token").alias("tf"))
    )

    # 4. Document frequency: words_tf holds exactly one row per (_id, token),
    #    so counting rows per token gives the number of distinct documents
    #    without collecting every id of a token into a single array.
    words_df = words_tf.groupBy("token").agg(F.count("_id").alias("df"))

    # 5. Join by column name (words_df is derived from words_tf, so joining
    #    on column objects would be a self-join on the same lineage) and
    #    derive idf and tf-idf.
    idf = F.log10(F.lit(t_num) / F.col("df"))
    tokensWithTfIdf = words_tf.join(words_df, on="token").select(
        "_id",
        "token",
        "tf",
        "df",
        idf.alias("idf"),
        (idf * F.col("tf")).alias("tf_idf"),
    )

    # 6. Cache the result for further usage.
    tokensWithTfIdf.cache()
    return tokensWithTfIdf
def add_computed_cols(entityInfo):
    """Return ``entityInfo`` with four log10-derived columns appended.

    Adds ``log_ncat`` (0 when ``ncat`` is not greater than 0),
    ``log_nCustomers``, ``log_spend`` (log10 of ``spend + 0.01``) and
    ``log_orders``.
    """
    has_categories = F.col("ncat") > 0
    return (
        entityInfo
        .withColumn("log_ncat",
                    F.when(has_categories, F.log10("ncat")).otherwise(0))
        .withColumn("log_nCustomers", F.log10(F.col("nCustomers")))
        # spend is shifted by 0.01 before taking the logarithm
        .withColumn("log_spend", F.log10(F.col("spend") + 0.01))
        .withColumn("log_orders", F.log10(F.col("orders")))
    )
def createTrans09(sparkDF):
    """Apply the ninth batch of feature transformations.

    Appends four ``LogUnknownIncomeDebtRatioPer*`` features (the log debt
    ratio minus ``log1p`` of a count column), a null-safe
    ``LogNumberRealEstateLoansOrLines``, drops the raw count columns, then
    adds ``LowAge`` / ``Logage`` and drops ``age``.

    input:  sparkDF       type sparkDF
    output: sparkDFTrans  type sparkDF
    """
    frame = sparkDF

    # (count column, name of the derived feature), in output order
    per_count_features = (
        ("NumberOfOpenCreditLinesAndLoans",
         "LogUnknownIncomeDebtRatioPerLine"),
        ("NumberRealEstateLoansOrLines",
         "LogUnknownIncomeDebtRatioPerRealEstateLine"),
        ("NumberOfTimesPastDue",
         "LogUnknownIncomeDebtRatioPerDelinquency"),
        ("NumberOfTimes90DaysLate",
         "LogUnknownIncomeDebtRatioPer90DaysLate"),
    )
    for count_name, feature_name in per_count_features:
        feature = (frame.LogUnknownIncomeDebtRatio
                   - log1p(getattr(frame, count_name)))
        frame = frame.select("*", feature.alias(feature_name))

    # log10 of the real-estate line count; null results become 0
    frame = frame.select(
        "*",
        log10(frame.NumberRealEstateLoansOrLines)
        .alias("LogNumberRealEstateLoansOrLines"),
    )
    logged_lines = frame.LogNumberRealEstateLoansOrLines
    frame = frame.withColumn(
        "LogNumberRealEstateLoansOrLines",
        when(logged_lines.isNull(), 0).otherwise(logged_lines),
    )

    # raw count columns are no longer needed
    for obsolete in (
        "NumberRealEstateLoansOrLines",
        "NumberOfOpenCreditLinesAndLoans",
        "NumberOfTimesPastDue",
        "NumberOfTimes90DaysLate",
        "NumberOfTime3059DaysPastDueNotWorse",
        "NumberOfTime6089DaysPastDueNotWorse",
    ):
        frame = frame.drop(getattr(frame, obsolete))

    # age features: under-18 indicator and log10(age - 17)
    low_age = when(frame.age < 18, 1).otherwise(0)
    frame = frame.select("*", low_age.alias("LowAge"))
    frame = frame.select("*", log10(frame.age - 17).alias("Logage"))

    return frame.drop(frame.age)
def tf_idf(df, n):
    """Compute TF-IDF scores for every (document, token) pair of ``df``.

    Args:
        df: Spark DataFrame with an ``_id`` column and a ``text_entry``
            column holding the document text.
        n: Number of documents used as the numerator of the idf ratio.

    Returns:
        Spark DataFrame with the columns ``token``, ``_id``, ``tf``, ``df``,
        ``idf`` (``log10(n / df)``) and ``tf_idf`` (``tf * idf``).
    """
    # Extract the terms of each row/document as a list. Everything that is
    # not a word character, whitespace or '-' is removed first.
    cleaned = f.lower(f.regexp_replace(f.col('text_entry'), r'[^\w\s-]', ''))
    tokens = (
        df.withColumn('terms', f.split(cleaned, ' '))
        # total number of words per row/document (only used as a group key)
        .withColumn('total_num_words', f.size('terms'))
        # one row per word occurrence
        .withColumn('token', f.explode('terms'))
    )

    # tf: raw occurrence count per (document, token). The aggregate is
    # aliased directly rather than renamed from its auto-generated column
    # name, and no intermediate sort is done: the join below shuffles the
    # rows again, so sorting here only costs time.
    term_freq = tokens.groupBy('_id', 'token', 'total_num_words').agg(
        f.count('token').alias('tf'))

    # df: number of distinct documents containing the token, and idf.
    doc_freq = (
        term_freq.groupBy('token')
        .agg(f.countDistinct('_id').alias('df'))
        .withColumn('idf', f.log10(f.lit(n) / f.col('df')))
    )

    # doc_freq is derived from term_freq, so join by column name instead of
    # comparing two column objects that share the same lineage.
    joined = term_freq.join(doc_freq, on='token').select(
        'token', '_id', 'tf', 'df', 'idf')
    return joined.withColumn('tf_idf', f.col('tf') * f.col('idf'))
def get_features_and_labels(transactions_df, transactions_id_cols,
                            transactions_cat_cols):
    """Split the transactions frame into a feature frame and a label frame.

    Args:
        transactions_df: Spark DataFrame of transactions; must contain
            ``TransactionID``, ``isFraud`` and ``TransactionAmt``.
        transactions_id_cols: Comma-separated names of identifier columns
            that are excluded from the features.
        transactions_cat_cols: Comma-separated names of categorical columns
            that are one-hot encoded.

    Returns:
        Tuple ``(features, labels)`` of Spark DataFrames. ``features`` holds
        the one-hot encoded feature columns with nulls filled with 0 and
        ``TransactionAmt`` replaced by its log10; ``labels`` holds
        ``TransactionID`` and ``isFraud``.
    """
    # Split the comma-separated arguments once and reuse the lists.
    id_cols = transactions_id_cols.split(",")
    cat_cols = transactions_cat_cols.split(",")

    # Get features
    non_feature_cols = ['isFraud', 'TransactionDT'] + id_cols
    feature_cols = [
        name for name in transactions_df.columns
        if name not in non_feature_cols
    ]
    # Log the column list; the message previously interpolated the
    # DataFrame object itself although it is labelled "columns".
    logger.info(f'transactions_df columns: {transactions_df.columns}')
    logger.info(f'transactions_id_cols columns: {transactions_id_cols}')
    logger.info(f'Feature columns: {feature_cols}')
    logger.info("Categorical columns: {}".format(cat_cols))

    # One-hot encode through Koalas, then return to a Spark DataFrame.
    features = transactions_df.select(feature_cols)
    kdf_features = features.to_koalas()
    kdf_features = ks.get_dummies(kdf_features, columns=cat_cols).fillna(0)
    features = kdf_features.to_spark()
    features = features.withColumn('TransactionAmt',
                                   fc.log10(fc.col('TransactionAmt')))
    logger.info("Transformed feature columns: {}".format(
        list(features.columns)))
    logger.info("Transformed feature count: {}".format(features.count()))

    # Get labels
    labels = transactions_df.select('TransactionID', 'isFraud')
    logger.info("Transformed label columns: {}".format(list(labels.columns)))
    logger.info("Shape of label: {}".format(labels.count()))

    return features, labels
def apply_tracklet_cuts(df: DataFrame) -> DataFrame:
    """ Select potential tracklet candidates based on property cuts.

    Three selection criteria are combined first:

    1. ``candidate.ssnamenr`` equals the string ``'null'``
       (no known asteroid counterpart)
    2. ``candidate.ndethist`` equals 1 (single detection)
    3. ``candidate.isdiffpos`` equals ``'t'`` (positive detection)

    Then alerts falling inside the locus of variable stars and bad
    subtractions are rejected.

    Parameters
    ----------
    df: Spark DataFrame
        Input dataframe containing alert data

    Returns
    ----------
    df_filt: Spark DataFrame
        Cached Spark DataFrame of smaller size containing only potential
        tracklet candidate data based on the cuts.

    Examples
    ----------
    >>> df = spark.read.format('parquet').load(ztf_alert_sample)
    >>> df_filt = apply_tracklet_cuts(df)
    >>> df_filt.count()
    16
    """
    # Selection: unknown to the asteroid catalogue, seen once, positive.
    # (A SIMBAD cut on cdsxmatch was judged unnecessary and is not applied.)
    not_known_asteroid = df['candidate.ssnamenr'] == 'null'
    single_detection = df['candidate.ndethist'] == 1
    positive_detection = df['candidate.isdiffpos'] == 't'
    selected = not_known_asteroid & single_detection & positive_detection

    # Simple definition of the locus containing (most of) stellar
    # variability as well as bad subtractions: the variation is fainter than
    # the template object itself, and the distance is below a typical FWHM.
    mag_difference = df['candidate.magnr'] - df['candidate.magpsf']
    shifted_log_distance = F.log10(df['candidate.distnr']) + 0.2
    in_locus = (
        (mag_difference < 1.0)
        & (mag_difference < (-4 * shifted_log_distance))
        & (df['candidate.distnr'] < 2)
    )

    return df.filter(selected & ~in_locus).cache()
def indexing(df_data, num_docs=111396.0):
    """Build a TF-IDF index over the ``text_entry`` column of ``df_data``.

    Args:
        df_data: Spark DataFrame with ``_id`` and ``text_entry`` columns.
        num_docs: Number of documents used as the numerator of the idf
            ratio. Defaults to 111396.0, the value that used to be
            hard-coded, so existing callers are unaffected.

    Returns:
        Spark DataFrame with one row per (``_id``, ``token_words``) pair
        holding ``TF``, ``DF``, ``IDF`` (``log10(num_docs / DF)``) and
        ``TF-IDF`` (``TF * IDF``).
    """
    # Strip punctuation, then tokenize. The Tokenizer reads a column named
    # 'textentry' — presumably the output name of removepunctuations, which
    # is defined outside this block.
    cleaned = df_data.select(df_data._id,
                             removepunctuations(df_data.text_entry))
    tokenizer = Tokenizer(inputCol='textentry', outputCol="words")
    tokenized = tokenizer.transform(cleaned)

    # One row per token occurrence.
    tokens = tokenized.select(tokenized._id, tokenized.textentry,
                              explode(tokenized.words).alias('token_words'))

    term_freq = tokens.groupBy("_id", "token_words").agg(
        count("token_words").alias("TF"))
    doc_freq = tokens.groupBy("token_words").agg(
        countDistinct("_id").alias("DF"))

    # Compute IDF in one expression. Previously a lower-case 'idf' ratio
    # column was created first and 'IDF' was derived from it; two columns
    # that differ only by case are easy to confuse.
    idf_calc = doc_freq.withColumn(
        "IDF", log10(float(num_docs) / doc_freq['DF']))

    scored = term_freq.join(idf_calc, "token_words", "left").withColumn(
        "TF-IDF", col("TF") * col("IDF"))
    return scored
def run_entity_extraction(srcFilePath, partName):
    """Build the per-entity info table for one data partition and write it
    out as CSV.

    NOTE(review): this block uses Python 2 ``print`` statements and calls
    ``.map`` directly on a DataFrame, so it presumably targets Python 2 and
    an early Spark release — confirm before porting.

    Relies on names defined outside this function: ``sqlContext``,
    ``data_table``, ``entity``, ``weightCol``, ``userCols``, ``summaryOps``,
    ``AWS_BUCKET_NAME``, ``destEntityInfoPath`` and the ``load_data`` /
    ``build_*`` / ``analyse_transactions`` / ``generate_demographic_info``
    helpers.

    :param srcFilePath: passed to ``load_data`` as the data source
    :param partName: suffix appended to ``data_table`` (after "_") to form
        the second argument of ``load_data``
    :return: tuple ``(entities, entityUserDF)`` — the collected values of the
        ``entity`` column and the table returned by ``build_entity_user_tbl``
    """
    # start from a clean cache before loading this partition
    sqlContext.clearCache()
    df = load_data(srcFilePath, data_table + "_" + partName)
    df.printSchema()
    # filter out invalid data
    # df = df.filter(df[entity].isNotNull())
    orderUserInfoDF = build_order_tbl(df)
    entityUserDF = build_entity_user_tbl(orderUserInfoDF)
    entityInfoDF = analyse_transactions(df)
    # add proj Wt
    # (sum of weightCol per entity, inner-joined onto the entity table)
    weightColPerOrder = orderUserInfoDF.groupby(entity).agg(
        F.sum(weightCol).alias(weightCol))
    entityInfoDF = entityInfoDF.join(weightColPerOrder, entity)
    #add growthRate
    entityInfoDF = entityInfoDF.join(
        build_frac_growth_rate_tbl(orderUserInfoDF), entity, how="left_outer")
    print 'generating demographics'
    demographicTbl = generate_demographic_info(entityUserDF, userCols)
    entityInfoDF = entityInfoDF.join(demographicTbl, entity, how='left_outer')
    # finally add bunch of log values for selected cols
    # (numeric columns whose name starts with one of summaryOps['logs'])
    cols_for_log = [
        x for x, y in entityInfoDF.dtypes
        if any(map(x.startswith, summaryOps['logs']))
        if y in ['double', 'float', 'int', 'long', 'bigint']
    ]
    print "gen log for cols", cols_for_log
    for col in cols_for_log:
        # log10 for positive values, 0 otherwise
        entityInfoDF = entityInfoDF.withColumn(
            "log_" + col,
            F.when(entityInfoDF[col] > 0, F.log10(col)).otherwise(0))
    # entityInfoDF = entityInfoDF.cache()
    # write the table as a single CSV part with a header, replacing any
    # previous output at the destination
    entityInfoDF.repartition(1).write.mode("overwrite").format('com.databricks.spark.csv') \
        .options(header='true', mode="overwrite") \
        .save('/mnt/' + AWS_BUCKET_NAME + '/' + destEntityInfoPath)
    orderUserInfoDF.unpersist()
    entities = entityInfoDF.select(entity).map(lambda r: r[entity]).collect()
    return (entities, entityUserDF)
def createTrans07(sparkDF):
    """Apply the seventh batch of feature transformations.

    Appends per-person ratios, null-safe log10 versions of the income and
    delinquency count columns, income-relative log features and the
    difference ``LogRatio90to3059DaysLate``. ``MonthlyIncome`` is dropped
    once its logarithm has been stored.

    input:  sparkDF       type sparkDF
    output: sparkDFTrans  type sparkDF
    """

    def add_log10_or_zero(frame, source_name, log_name):
        # Append log10(source) as log_name, then replace nulls with 0.
        frame = frame.select(
            "*", log10(getattr(frame, source_name)).alias(log_name))
        logged = getattr(frame, log_name)
        return frame.withColumn(
            log_name, when(logged.isNull(), 0).otherwise(logged))

    frame = sparkDF

    # ratios per household member (dependents + 1)
    frame = frame.select(
        "*",
        (frame.NumberRealEstateLoansOrLines / (1 + frame.NumberOfDependents))
        .alias("RealEstateLoansPerPerson"),
    )
    frame = frame.select(
        "*",
        (frame.age / (1 + frame.NumberOfDependents))
        .alias("YearsOfAgePerDependent"),
    )

    # income on a log scale; the raw column is dropped afterwards
    frame = add_log10_or_zero(frame, "MonthlyIncome", "LogMonthlyIncome")
    frame = frame.drop("MonthlyIncome")

    frame = frame.select(
        "*",
        (frame.LogMonthlyIncome - log1p(frame.NumberOfDependents))
        .alias("LogIncomePerPerson"),
    )
    frame = frame.select(
        "*",
        (frame.LogMonthlyIncome - log1p(frame.age)).alias("LogIncomeAge"),
    )

    # delinquency counts on a log scale, in output order
    delinquency_logs = (
        ("NumberOfTimesPastDue", "LogNumberOfTimesPastDue"),
        ("NumberOfTimes90DaysLate", "LogNumberOfTimes90DaysLate"),
        ("NumberOfTime3059DaysPastDueNotWorse",
         "LogNumberOfTime3059DaysPastDueNotWorse"),
        ("NumberOfTime6089DaysPastDueNotWorse",
         "LogNumberOfTime6089DaysPastDueNotWorse"),
    )
    for source_name, log_name in delinquency_logs:
        frame = add_log10_or_zero(frame, source_name, log_name)

    # difference of logs, i.e. the log of the 90-day / 30-59-day ratio
    log_ratio = (frame.LogNumberOfTimes90DaysLate
                 - frame.LogNumberOfTime3059DaysPastDueNotWorse)
    return frame.select("*", log_ratio.alias("LogRatio90to3059DaysLate"))
def createTrans06(sparkDF):
    """Apply the sixth batch of feature transformations.

    Appends a null-safe log10 of the revolving utilization (dropping the raw
    column), six delinquency-per-line ratios that are forced to 0 when the
    denominator is 0, three ``LogDebtPer*`` features and
    ``RevolvingLinesPerPerson``.

    input:  sparkDF       type sparkDF
    output: sparkDFTrans  type sparkDF
    """

    def add_guarded_ratio(frame, numerator, denominator, ratio_name):
        # Append numerator / denominator, then set it to 0 wherever the
        # denominator equals 0.
        ratio = getattr(frame, numerator) / getattr(frame, denominator)
        frame = frame.select("*", ratio.alias(ratio_name))
        return frame.withColumn(
            ratio_name,
            when(getattr(frame, denominator) == 0, 0)
            .otherwise(getattr(frame, ratio_name)),
        )

    frame = sparkDF

    # utilization on a log scale; null results become 0
    frame = frame.select(
        "*",
        log10(frame.RevolvingUtilizationOfUnsecuredLines)
        .alias("LogRevolvingUtilizationOfUnsecuredLines"),
    )
    logged = frame.LogRevolvingUtilizationOfUnsecuredLines
    frame = frame.withColumn(
        "LogRevolvingUtilizationOfUnsecuredLines",
        when(logged.isNull(), 0).otherwise(logged),
    )
    frame = frame.drop("RevolvingUtilizationOfUnsecuredLines")

    # (numerator, denominator, new column), in output order
    guarded_ratios = (
        ("NumberOfTimesPastDue", "NumberOfOpenCreditLinesAndLoans",
         "DelinquenciesPerLine"),
        ("NumberOfTimes90DaysLate", "NumberOfOpenCreditLinesAndLoans",
         "MajorDelinquenciesPerLine"),
        ("NumberOfTime3089DaysPastDueNotWorse",
         "NumberOfOpenCreditLinesAndLoans",
         "MinorDelinquenciesPerLine"),
        ("NumberOfTimesPastDue", "RevolvingLines",
         "DelinquenciesPerRevolvingLine"),
        ("NumberOfTimes90DaysLate", "RevolvingLines",
         "MajorDelinquenciesPerRevolvingLine"),
        ("NumberOfTime3089DaysPastDueNotWorse", "RevolvingLines",
         "MinorDelinquenciesPerRevolvingLine"),
    )
    for numerator, denominator, ratio_name in guarded_ratios:
        frame = add_guarded_ratio(frame, numerator, denominator, ratio_name)

    # log debt relative to a count: LogDebt - log1p(count)
    log_debt_features = (
        ("NumberOfOpenCreditLinesAndLoans", "LogDebtPerLine"),
        ("NumberRealEstateLoansOrLines", "LogDebtPerRealEstateLine"),
        ("NumberOfDependents", "LogDebtPerPerson"),
    )
    for count_name, feature_name in log_debt_features:
        feature = frame.LogDebt - log1p(getattr(frame, count_name))
        frame = frame.select("*", feature.alias(feature_name))

    per_person = frame.RevolvingLines / (1 + frame.NumberOfDependents)
    return frame.select("*", per_person.alias("RevolvingLinesPerPerson"))
def createTrans02(sparkDF):
    """Apply the second batch of feature transformations.

    Appends ``ZeroRevolvingUtilization``, zeroes out utilization values whose
    log10 exceeds 3, derives ``LogDebt`` and ``RevolvingLines`` and appends
    the indicator columns ``HasRevolvingLines``, ``HasRealEstateLoans``,
    ``HasMultipleRealEstateLoans``, ``EligibleSS``, ``DTIOver33`` and
    ``DTIOver43``.

    input:  sparkDF       type sparkDF
    output: sparkDFTrans  type sparkDF
    """

    def indicator(condition):
        # 1 where the condition holds, 0 otherwise
        return when(condition, 1).otherwise(0)

    frame = sparkDF

    frame = frame.select(
        "*",
        indicator(frame.RevolvingUtilizationOfUnsecuredLines == 0)
        .alias("ZeroRevolvingUtilization"),
    )

    # utilization values above 10**3 are reset to 0
    utilization = frame.RevolvingUtilizationOfUnsecuredLines
    frame = frame.withColumn(
        "RevolvingUtilizationOfUnsecuredLines",
        when(log10(utilization) > 3, 0).otherwise(utilization),
    )

    # LogDebt: income (at least 1) times debt ratio, log10, nulls -> 0
    floored_income = when(frame.MonthlyIncome <= 1, 1).otherwise(
        frame.MonthlyIncome)
    frame = frame.select("*", floored_income.alias("LogDebt"))
    frame = frame.withColumn("LogDebt",
                             log10(frame.LogDebt * frame.DebtRatio))
    frame = frame.withColumn(
        "LogDebt",
        when(frame.LogDebt.isNull(), 0).otherwise(frame.LogDebt),
    )

    # open lines that are not real-estate lines
    frame = frame.withColumn(
        "RevolvingLines",
        frame.NumberOfOpenCreditLinesAndLoans
        - frame.NumberRealEstateLoansOrLines,
    )

    frame = frame.select(
        "*", indicator(frame.RevolvingLines > 0).alias("HasRevolvingLines"))
    frame = frame.select(
        "*",
        indicator(frame.NumberRealEstateLoansOrLines > 0)
        .alias("HasRealEstateLoans"),
    )
    frame = frame.select(
        "*",
        indicator(frame.NumberRealEstateLoansOrLines > 2)
        .alias("HasMultipleRealEstateLoans"),
    )
    frame = frame.select("*", indicator(frame.age > 60).alias("EligibleSS"))

    # debt-to-income thresholds, only for rows where NoIncome is 0
    frame = frame.select(
        "*",
        indicator((frame.NoIncome == 0) & (frame.DebtRatio > 0.33))
        .alias("DTIOver33"),
    )
    frame = frame.select(
        "*",
        indicator((frame.NoIncome == 0) & (frame.DebtRatio > 0.43))
        .alias("DTIOver43"),
    )
    return frame
def parse_genebass_evidence(genebass_df: DataFrame) -> DataFrame:
    """
    Parse Genebass's disease/target evidence.

    P-values equal to 0.0 are replaced by the smallest positive p-value of
    the input before being split into mantissa and exponent.

    Args:
        genebass_df: DataFrame with Genebass's portal data
    Returns:
        evd_df: DataFrame with Genebass's data following the t/d evidence schema.
    """
    # floor is needed for the exponent below; imported locally so this block
    # does not depend on the module-level import list.
    from pyspark.sql.functions import floor

    to_keep = [
        'datasourceId',
        'datatypeId',
        'targetFromSourceId',
        'diseaseFromSource',
        'diseaseFromSourceId',
        'diseaseFromSourceMappedId',
        'pValueMantissa',
        'pValueExponent',
        'beta',
        'betaConfidenceIntervalLower',
        'betaConfidenceIntervalUpper',
        'oddsRatio',
        'oddsRatioConfidenceIntervalLower',
        'oddsRatioConfidenceIntervalUpper',
        'resourceScore',
        'ancestry',
        'ancestryId',
        'projectId',
        'cohortId',
        'studySampleSize',
        'studyCases',
        'statisticalMethod',
        'statisticalMethodOverview',
    ]

    # WARNING: There are some associations with a p-value of 0.0 in Genebass.
    # This is a bug we still have to elucidate and it might be due to a float overflow.
    # These evidence need to be manually corrected in order not to lose them and for them to pass validation
    # As an interim solution, their p value will equal to the minimum in the evidence set.
    logging.warning(
        f"There are {genebass_df.filter(col('Pvalue_Burden') == 0.0).count()} evidence with a p-value of 0.0."
    )
    minimum_pvalue = (
        genebass_df.filter(col('Pvalue_Burden') > 0.0).agg({'Pvalue_Burden': 'min'}).collect()[0]['min(Pvalue_Burden)']
    )
    genebass_df = genebass_df.withColumn(
        'Pvalue_Burden', when(col('Pvalue_Burden') == 0.0, lit(minimum_pvalue)).otherwise(col('Pvalue_Burden'))
    )

    # Stats are split taking into consideration the type of the trait
    # Those that are not continuous or categorical were reviewed and all of them are considered as categorical
    is_continuous = col('trait_type') == 'continuous'
    is_categorical = col('trait_type').isin(['categorical', 'icd_first_occurrence', 'icd10'])

    return (
        genebass_df.withColumn('datasourceId', lit('gene_burden'))
        .withColumn('datatypeId', lit('genetic_association'))
        .withColumn('projectId', lit('Genebass'))
        .withColumn('cohortId', lit('UK Biobank 450k'))
        .withColumn('ancestry', lit('EUR'))
        .withColumn('ancestryId', lit('HANCESTRO_0009'))
        .withColumnRenamed('gene_id', 'targetFromSourceId')
        .withColumnRenamed('description', 'diseaseFromSource')
        .withColumnRenamed('phenocode', 'diseaseFromSourceId')
        .join(
            import_trait_mappings(),
            on='diseaseFromSource',
            how='left',
        )
        .withColumnRenamed('Pvalue_Burden', 'resourceScore')
        # Scientific-notation exponent: floor(log10(p)). Truncating the
        # logarithm toward zero and subtracting 1 is off by one whenever
        # log10(p) is an integer (p = 1e-5 gave exponent -6 and mantissa
        # 10.0) and whenever p >= 1.
        .withColumn('pValueExponent', floor(log10(col('resourceScore'))).cast(IntegerType()))
        .withColumn('pValueMantissa', round(col('resourceScore') / pow(lit(10), col('pValueExponent')), 3))
        .withColumn('beta', when(is_continuous, col('BETA_Burden')))
        .withColumn(
            'betaConfidenceIntervalLower',
            when(is_continuous, col('BETA_Burden') - col('SE_Burden')),
        )
        .withColumn(
            'betaConfidenceIntervalUpper',
            when(is_continuous, col('BETA_Burden') + col('SE_Burden')),
        )
        .withColumn('oddsRatio', when(is_categorical, col('BETA_Burden')))
        .withColumn(
            'oddsRatioConfidenceIntervalLower',
            when(is_categorical, col('BETA_Burden') - col('SE_Burden')),
        )
        .withColumn(
            'oddsRatioConfidenceIntervalUpper',
            when(is_categorical, col('BETA_Burden') + col('SE_Burden')),
        )
        # controls may be missing; treat them as 0 when summing
        .withColumn('studySampleSize', (col('n_cases') + coalesce('n_controls', lit(0))))
        .withColumnRenamed('n_cases', 'studyCases')
        .withColumnRenamed('annotation', 'statisticalMethod')
        .withColumn('statisticalMethodOverview', col('statisticalMethod'))
        .replace(to_replace=METHOD_DESC, subset=['statisticalMethodOverview'])
        .select(to_keep)
        .distinct()
    )
# Largest and smallest GDP value before rescaling
gdp_values = data_reduce.select("GDP").rdd
print(gdp_values.max()[0])
print(gdp_values.min()[0])

# In[14]:

import math
from pyspark.sql.functions import col
from pyspark.sql.types import FloatType
from pyspark.sql import functions as F

print(data_reduce.head())
# Replace each of the first 17 columns, except the one at index 1, by
# log10(value + 1)
for i in range(17):
    if i != 1:
        column_name = data_reduce.columns[i]
        data_reduce = data_reduce.withColumn(
            column_name, F.log10(col(column_name) + 1))
print(data_reduce.head())

# In[15]:

# Row positions serve as the x axis of every scatter plot
x = list(range(data_reduce.count()))
get_ipython().run_line_magic('matplotlib', 'notebook')
for i in range(17):
    column_name = data_reduce.columns[i]
    plt.subplot(6, 3, i + 1)
    y = data_reduce.select(column_name).collect()
    plt.scatter(x, y, color='green', marker='o', edgecolor='black',
                alpha=0.5)
    plt.title(column_name)
plt.show()

# # 6.3
def createTrans01(sparkDF):
    """Apply the first batch of feature transformations.

    Appends missing-value and zero-value indicator columns for dependents,
    income and debt ratio, replaces unknown dependents / income / debt ratio
    by 0, and appends ``WeirdRevolvingUtilization``.

    input:  sparkDF       type sparkDF
    output: sparkDFTrans  type sparkDF
    """

    def indicator(condition):
        # 1 where the condition holds, 0 otherwise
        return when(condition, 1).otherwise(0)

    frame = sparkDF

    # missing-value indicators must be taken before the nulls are replaced
    frame = frame.select(
        "*",
        indicator(frame.NumberOfDependents.isNull())
        .alias("UnknownNumberOfDependents"),
    )
    frame = frame.select(
        "*",
        indicator(frame.MonthlyIncome.isNull()).alias("UnknownMonthlyIncome"),
    )
    frame = frame.select(
        "*", indicator(frame.NumberOfDependents == 0).alias("NoDependents"))
    frame = frame.withColumn(
        "NumberOfDependents",
        when(frame.UnknownNumberOfDependents == 1, 0)
        .otherwise(frame.NumberOfDependents),
    )

    frame = frame.select(
        "*", indicator(frame.MonthlyIncome == 0).alias("NoIncome"))
    frame = frame.withColumn(
        "NoIncome",
        when(frame.NoIncome.isNull(), 0).otherwise(frame.NoIncome),
    )
    frame = frame.withColumn(
        "MonthlyIncome",
        when(frame.UnknownMonthlyIncome == 1, 0)
        .otherwise(frame.MonthlyIncome),
    )

    frame = frame.select(
        "*", indicator(frame.DebtRatio == 0).alias("ZeroDebtRatio"))
    # NOTE(review): this reproduces DebtRatio unchanged (0 stays 0); the
    # column name suggests the condition was meant to involve
    # UnknownMonthlyIncome — confirm with the author.
    frame = frame.select(
        "*",
        when(frame.DebtRatio == 0, 0).otherwise(frame.DebtRatio)
        .alias("UnknownIncomeDebtRatio"),
    )
    frame = frame.withColumn(
        "DebtRatio",
        when(frame.UnknownMonthlyIncome == 1, 0).otherwise(frame.DebtRatio),
    )

    # keep the utilization only where its log10 exceeds 3, else 0
    utilization = frame.RevolvingUtilizationOfUnsecuredLines
    frame = frame.select(
        "*",
        when(log10(utilization) > 3, utilization).otherwise(0)
        .alias("WeirdRevolvingUtilization"),
    )
    return frame
def main():
    """Weight LD correlations by study population and run PICS credible-set
    analysis, writing the result as parquet.

    Steps, in order:
      1. Load the LD table and the manifest and join them on the index
         variant id.
      2. Transform the per-population R columns with ``arctanh``, combine
         them with the ``*_prop`` weights, transform back with ``tanh`` and
         square the result to obtain ``R2_overall``; rows below
         ``args.min_r2`` are removed.
      3. Join ``-log10(p)`` of the lead variants from the top-loci table and
         compute the PICS mean, standard deviation, relative and posterior
         probabilities plus the 95% / 99% credible-set flags.
      4. Write the selected columns to ``args.out``.

    Side effects: sets the module-level ``spark`` session and overwrites the
    parquet output at ``args.out``.

    Returns:
        0
    """

    # Args
    args = parse_args()
    # args.in_ld_folder = 'input_data/ld_each_variant'
    # args.in_manifest = 'input_data/190625/ld_analysis_input.tsv'
    # args.in_top_loci = 'input_data/190625/toploci.parquet'
    # args.out = 'output/ld_w_crediblesets.parquet'
    # args.min_r2 = 0.5

    # Make spark session
    global spark
    spark = (pyspark.sql.SparkSession.builder.config("spark.master", "local[*]").getOrCreate())
    print('Spark version: ', spark.version)

    #
    # Load data ---------------------------------------------------------------
    #

    # Load LD
    # (variant ids are rewritten from ':'-separated to '_'-separated)
    ld = (
        load_ld(args.in_ld_folder).withColumn(
            'index_variant_id',
            regexp_replace(col('index_variant_id'), ':', '_')).withColumn(
                'tag_variant_id',
                regexp_replace(col('tag_variant_id'), ':', '_'))
        # .limit(10000) # Debug
    )

    # Load manifest
    manifest = (load_manifest(args.in_manifest).withColumnRenamed(
        'variant_id', 'index_variant_id'))

    #
    # Weight correlations by study population ---------------------------------
    #

    # Join LD to manifest
    data = manifest.join(ld, on='index_variant_id')

    # Replace R fields
    # NOTE(review): only R == 1 is adjusted; R == -1 is passed to arctanh
    # unchanged — confirm whether that value can occur in the input.
    for coln in ['R_AFR', 'R_AMR', 'R_EAS', 'R_EUR', 'R_SAS']:
        data = (
            data
            # Replace all R values == 1 with 0.9999995, otherwise we get error
            # This is reverted later by rounding to 6 dp
            .withColumn(coln,
                        when(col(coln) == 1, 0.9999995).otherwise(col(coln))
                        )
            # Fill nulls with 0
            .withColumn(coln,
                        when(col(coln).isNull(), 0).otherwise(col(coln))
                        )
        )

    # Fisher transform correlations to z-scores
    for coln in ['R_AFR', 'R_AMR', 'R_EAS', 'R_EUR', 'R_SAS']:
        data = data.withColumn(coln.replace('R_', 'Z_'), arctanh(col(coln)))

    # Compute weighted average across populations
    data = data.withColumn(
        'Z_overall',
        ((col('AFR_prop') * col('Z_AFR')) + (col('AMR_prop') * col('Z_AMR')) +
         (col('EAS_prop') * col('Z_EAS')) + (col('EUR_prop') * col('Z_EUR')) +
         (col('SAS_prop') * col('Z_SAS'))))

    # Inverse Fisher transform weighted z-score back to correlation
    data = data.withColumn('R_overall', tanh(col('Z_overall')))

    # Round R_overall to 6 dp
    data = data.withColumn('R_overall', round6dp(col('R_overall')))

    # Convert R to R2
    data = data.withColumn('R2_overall', pow(col('R_overall'), 2))

    # Drop rows where R2 is null
    data = data.filter(col('R2_overall').isNotNull())

    # Filter based on overall R2
    data = data.filter(col('R2_overall') >= args.min_r2)

    # Drop unneeded columns
    data = data.drop(*[
        'Z_overall', 'R_overall', 'R_AFR', 'R_AMR', 'R_EAS', 'R_EUR', 'R_SAS',
        'Z_AFR', 'Z_AMR', 'Z_EAS', 'Z_EUR', 'Z_SAS', 'index_variant_id'
    ])

    # Denormalise variant IDs
    # (lead columns are renamed; the tag id is split on '_' into
    # chrom / pos / ref / alt)
    data = (data.withColumnRenamed('chrom', 'lead_chrom').withColumnRenamed(
        'pos', 'lead_pos').withColumnRenamed(
            'ref', 'lead_ref').withColumnRenamed('alt', 'lead_alt').withColumn(
                'tag_split', split(col('tag_variant_id'), '_')).withColumn(
                    'tag_chrom', col('tag_split').getItem(0)).withColumn(
                        'tag_pos', col('tag_split').getItem(1).cast('int')).withColumn(
                            'tag_ref', col('tag_split').getItem(2)).withColumn(
                                'tag_alt', col('tag_split').getItem(3)).drop(
                                    'tag_split', 'tag_variant_id'))

    #
    # Conduct credible set analysis using PICS adjustment ---------------------
    #
    ''' Probabilistic Identification of Causal SNPs (PICS) from Farh (2014): https://www.nature.com/articles/nature13835 Adjusts the p-values for tag SNPs based on the p-value of the lead SNP and it's LD. '''

    # Empiric constant that can be adjusted to fit the curve, 6.4 recommended.
    k = 6.4

    # Load toploci
    toploci = spark.read.parquet(args.in_top_loci)

    # Join negative log pvalue from toploci onto data
    # neglog_p = -(log10(mantissa) + exponent)
    toploci = (toploci.withColumn(
        'neglog_p',
        -1 * (log10(col('pval_mantissa')) + col('pval_exponent'))).withColumnRenamed(
            'chrom', 'lead_chrom').withColumnRenamed(
                'pos', 'lead_pos').withColumnRenamed(
                    'ref', 'lead_ref').withColumnRenamed(
                        'alt', 'lead_alt').select('study_id', 'lead_chrom', 'lead_pos',
                                                  'lead_ref', 'lead_alt', 'neglog_p'))
    data = data.join(
        toploci,
        on=['study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt'])

    # Calculate PICS statistics
    # pics_mu  = R2 * neglog_p
    # pics_std = sqrt(1 - sqrt(R2)**k) * sqrt(neglog_p) / 2
    # A zero standard deviation gets relative probability 1.0 instead of
    # being passed to norm_sf.
    data = (data.withColumn('pics_mu', col('R2_overall') * col('neglog_p')).withColumn(
        'pics_std',
        sqrt(1 - sqrt(col('R2_overall'))**k) * sqrt(col('neglog_p')) / 2).withColumn(
            'pics_relative_prob',
            when(col('pics_std') == 0, 1.0).otherwise(
                norm_sf(col('pics_mu'), col('pics_std'), col('neglog_p')))))

    # Calculate the sum of the posterior probabilities at each locus
    pics_prob_sums = (data.groupby(
        'study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt').agg(
            sum('pics_relative_prob').alias('pics_relative_prob_sum')))

    # Merge back onto data
    data = data.join(
        pics_prob_sums,
        on=['study_id', 'lead_chrom', 'lead_pos', 'lead_ref', 'lead_alt'])

    # Calculate posterior probability at each locus
    data = (data.withColumn(
        'pics_postprob',
        col('pics_relative_prob') / col('pics_relative_prob_sum')).drop(
            'pics_relative_prob_sum', 'neglog_p'))

    # Calculate cumulative sum per locus
    # (rows ordered by descending posterior probability)
    window_spec = (Window.partitionBy('study_id', 'lead_chrom', 'lead_pos',
                                      'lead_ref', 'lead_alt').orderBy(
                                          desc('pics_postprob')).rowsBetween(
                                              Window.unboundedPreceding, Window.currentRow))
    data = (data.withColumn('pics_postprob_cumsum',
                            sum('pics_postprob').over(window_spec)))

    # Label whether each row is in the 95 and 99% credible sets
    # A row is flagged False only when the cumulative sum of the previous
    # row has already reached the threshold, so the row that crosses the
    # threshold is still flagged True.
    window_spec = (Window.partitionBy(
        'study_id', 'lead_chrom', 'lead_pos', 'lead_ref',
        'lead_alt').orderBy('pics_postprob_cumsum'))
    data = (data.withColumn(
        'pics_95perc_credset',
        when(lag('pics_postprob_cumsum', 1).over(window_spec) >= 0.95,
             False).otherwise(True)).withColumn(
                 'pics_99perc_credset',
                 when(
                     lag('pics_postprob_cumsum', 1).over(window_spec) >= 0.99,
                     False).otherwise(True)))

    #
    # Write output ------------------------------------------------------------
    #

    # Rename columns and format
    data = (data.withColumnRenamed(
        'AFR_prop', 'AFR_1000G_prop').withColumnRenamed(
            'AMR_prop', 'AMR_1000G_prop').withColumnRenamed(
                'EAS_prop', 'EAS_1000G_prop').withColumnRenamed(
                    'EUR_prop', 'EUR_1000G_prop').withColumnRenamed(
                        'SAS_prop', 'SAS_1000G_prop').withColumnRenamed(
                            'R2_overall', 'overall_r2').select(
                                'study_id', 'lead_chrom', 'lead_pos', 'lead_ref',
                                'lead_alt', 'tag_chrom', 'tag_pos', 'tag_ref',
                                'tag_alt', 'overall_r2', 'pics_mu', 'pics_postprob',
                                'pics_95perc_credset', 'pics_99perc_credset',
                                'AFR_1000G_prop', 'AMR_1000G_prop', 'EAS_1000G_prop',
                                'EUR_1000G_prop', 'SAS_1000G_prop'))

    # Save output
    (data.repartitionByRange('study_id', 'lead_chrom',
                             'lead_pos').write.parquet(args.out, mode='overwrite'))

    return 0
sf.lit(h3_resolution))) # group by the h3 grid .groupBy("h3") # grab counts .count() # add the centroid .withColumn("h3_centroid", h3_hex_centroid(sf.col("h3"))) # rename columns .select("h3", "count", sf.col("h3_centroid.lat").alias("lat"), sf.col("h3_centroid.lon").alias("lon"), sf.log10("count").alias("log_count")) # ensure sparse representation .filter(sf.col("count") > sf.lit(0))) # COMMAND ---------- # MAGIC %md __Convert results to Pandas__ # COMMAND ---------- pandas_df = h3_aggregation.toPandas() # COMMAND ---------- # MAGIC %md __Export results to CSV__
def mysolution(sc, spark):
    """Run the four-part request-log work sample and print each answer.

    Reads ``data/DataSample.csv`` and ``data/POIList.csv`` with ``spark``,
    prints intermediate DataFrames for answers 1-3 and 4a, writes the
    interpretation text to a file named ``bonus``, and prints the task paths
    for answer 4b.

    Args:
        sc: Spark context; accepted for interface compatibility, not used.
        spark: Spark session used to read the CSV inputs.

    Returns:
        None. All results are printed or written to the ``bonus`` file.
    """
    # ------------------------------------------------------------------
    # 1. Cleanup
    # Records with identical geoinfo AND timestamp are considered
    # suspicious and are filtered out.
    # ------------------------------------------------------------------
    # Forward slashes: "data\D..." relied on an invalid escape sequence and
    # only resolved on Windows.
    df_dataSample = spark.read.option("header", True).csv("data/DataSample.csv")

    print('\nDisplay Schema of DataSample.csv dataset table\n')
    df_dataSample.printSchema()

    print('\nDisplay contents of DataSample.csv dataset table\n')
    df_dataSample.show()

    # De-duplicate on the combined key. Chaining two dropDuplicates calls
    # (first on coordinates, then on timestamp) also removed legitimate
    # requests that merely shared a location or a timestamp.
    # NOTE(review): one record of every duplicate group is kept; confirm
    # whether all copies should be discarded instead.
    df_clean = df_dataSample.dropDuplicates([' TimeSt', 'Latitude', 'Longitude'])

    print ('\nDisplay clean dataset after dropping suspicius requests (i.e., duplicate geoinfo and timest)\n')
    df_clean.show()
    print ("\n------------------------END OF ANSWER #1------------------------\n")

    # ------------------------------------------------------------------
    # 2. Label
    # Assign each request to the closest (minimum distance) POI from
    # data/POIList.csv. A POI is a geographical Point of Interest.
    # ------------------------------------------------------------------
    df_poil = spark.read.option("header", True).csv("data/POIList.csv")

    print ('\nDisplay Schema and data of POIList dataset table\n')
    df_poil.printSchema()
    df_poil.show(5)

    # The POI list is collected to the driver and flattened to plain tuples
    # once, so the UDF does no pandas lookups or float() parsing per row.
    # (' Latitude' really has a leading space in the CSV header.)
    df_pd_pois = df_poil.toPandas()
    poi_points = [
        (poi_id, float(lat), float(lon))
        for poi_id, lat, lon in zip(df_pd_pois['POIID'],
                                    df_pd_pois[' Latitude'],
                                    df_pd_pois['Longitude'])
    ]

    def myfun(la2, lo2):
        """Return [POIID, distance] of the POI nearest to (la2, lo2).

        Distance is the planar Euclidean distance in coordinate units.
        Both entries are strings to match the declared array<string> type.
        """
        if la2 is None or lo2 is None:
            # Unparseable/missing coordinates: leave the request unlabelled.
            return None
        # min() keeps the first minimum, so ties go to the earlier POI.
        poi_id, min_dis = min(
            ((pid, math.hypot(la1 - la2, lo1 - lo2))
             for pid, la1, lo1 in poi_points),
            key=lambda pair: pair[1])
        return [poi_id, str(min_dis)]

    myfun_spark = F.udf(myfun, ArrayType(StringType()))

    # Coordinates are cast to double (not float) to avoid losing precision
    # before the distance is computed.
    df_poi = (
        df_clean
        .withColumn('temp_col',
                    myfun_spark(F.col('Latitude').cast(DoubleType()),
                                F.col('Longitude').cast(DoubleType())))
        .cache()
        .withColumn('POI', F.col('temp_col')[0])
        .withColumn('POI_DIS', F.col('temp_col')[1].cast(DoubleType()))
        .drop('temp_col')
    )

    print('Display the dataframe with new columns of nearest POI and POI_DIS(i.e, distance to POI from request)')
    df_poi.show(5)
    print ("\n------------------------END OF ANSWER #2------------------------\n")

    # ------------------------------------------------------------------
    # 3. Analysis
    # Per POI: average and standard deviation of the distance to its
    # assigned requests; radius of the circle centred on the POI that
    # contains all its requests; density = requests / area.
    # ------------------------------------------------------------------
    poi_stats = df_poi.groupby('POI').agg(
        F.avg('POI_DIS').alias('Average'),
        F.stddev('POI_DIS').alias('Std_Dev'))
    # Left join onto the POI list so POIs without any request still appear.
    df_avgSD = (
        df_poil
        .join(poi_stats, df_poil.POIID == poi_stats.POI, how='left')
        .drop(poi_stats.POI)
    )

    print('Display distance Average and Std_Dev for each POI')
    df_avgSD.show()
    print ("Note: Based on above output, it can be concluded that POI2 radius of influence is ZERO\n")

    # Radius = distance to the farthest assigned request. A plain aggregate
    # yields exactly one row per POI, even when several requests tie.
    df_radius = df_poi.groupby('POI').agg(F.max('POI_DIS').alias('POI_RADIUS'))
    df_avgSD_r = (
        df_avgSD
        .join(df_radius, df_avgSD.POIID == df_radius.POI, how='left')
        .drop(df_radius.POI)
    )

    print('Display the maximum POI_DIS (i.e, POI_RADIUS) values for each group\n')
    df_avgSD_r.show()

    # Number of requests assigned to each POI.
    df_no_of_req = df_poi.groupby('POI').agg(F.count('POI').alias('Requests'))
    df_poi_req = (
        df_avgSD_r
        .join(df_no_of_req, df_avgSD_r.POIID == df_no_of_req.POI, 'left')
        .drop(df_no_of_req['POI'])
    )

    # Density = requests / (pi * r^2); math.pi instead of the truncated 3.14.
    df_poi_density = df_poi_req.withColumn(
        'Density',
        F.col('Requests') / (math.pi * F.col('POI_RADIUS') ** 2))

    print('Dislay No. of Requests and Density for each POI')
    df_poi_density.show()
    print ("\n------------------------END OF ANSWER #3------------------------\n")

    # ------------------------------------------------------------------
    # 4a. Model
    # Map the popularity of each POI onto a scale from -10 to 10, taking
    # extreme cases and outliers into account.
    # ------------------------------------------------------------------
    from pyspark.ml import Pipeline
    from pyspark.ml.feature import MinMaxScaler, VectorAssembler

    # POIs without a density (no assigned requests) are dropped as outliers.
    df_poi_density_temp = df_poi_density.filter(df_poi_density.Density.isNotNull())

    # MinMaxScaler emits a one-element vector; unwrap it to a rounded double.
    myfun_vec2double = F.udf(lambda x: round(float(list(x)[0]), 3), DoubleType())

    assembler = VectorAssembler(inputCols=['Density'], outputCol="Density_Vector")
    scaler = MinMaxScaler(min=-10, max=10,
                          inputCol="Density_Vector", outputCol="Density_Scaled")
    pipeline = Pipeline(stages=[assembler, scaler])

    df_norm = (
        pipeline.fit(df_poi_density_temp)
        .transform(df_poi_density_temp)
        .withColumn("Density_Scaled", myfun_vec2double("Density_Scaled"))
        .drop("Density_Vector")
    )

    print('Display scaled density for each POI')
    df_norm.select(
        'POIID',
        *[F.round(c, 3).alias(c) for c in df_norm.columns[1:]]
    ).show()

    # Same scaling applied to log10(Density) to compress the skew.
    df_lognorm = df_norm.withColumn('log_Density', F.log10(F.col('Density')))

    assembler_log = VectorAssembler(inputCols=['log_Density'],
                                    outputCol="log_Density_Vector")
    # Range is (-10, 10) as the task and the interpretation text below state;
    # the scaler was previously configured for (-1, 1).
    scaler_log = MinMaxScaler(min=-10.0, max=10.0,
                              inputCol="log_Density_Vector",
                              outputCol="log_Density_Scaled")
    pipeline_log = Pipeline(stages=[assembler_log, scaler_log])

    df_lognorm = (
        pipeline_log.fit(df_lognorm)
        .transform(df_lognorm)
        .withColumn("log_Density_Scaled", myfun_vec2double("log_Density_Scaled"))
        .drop("log_Density_Vector")
    )

    print('Display scaled log_density for each POI')
    df_lognorm.select(
        'POIID',
        *[F.round(c, 3).alias(c) for c in df_lognorm.columns[1:]]
    ).show()

    # Save the interpretation of the results in the 'bonus' file.
    bonus = """
Interpretation:
Density column is the ratio of Requests to POI_Area. log_Density was calculated by taking log10 of Density values. log_Density were scaled in range (-10,10) to calculate log_Density_Scaled.

It is difficult to come up with a statitics with only 3 good POIs.

Nonetheless, the density values of POI1 and POI3 are 3 orders higher than POI4. Hence, Density_Scaled, log_Density and log_Density_Scaled values are also skewed.
POI1 and POI3 attract more customers or requests per unit area of influence.

Assumptions:
POI2 was dropped as outlier. POI2 data must be investigated to identify the cause of zero zone of influence. Bad data collection and formatting can be reasons for POI2 being outlier
"""
    # The context manager closes the file; no explicit close() is needed.
    with open('bonus', 'w') as f:
        f.write(bonus)

    print ("\n------------------------END OF ANSWER #4a------------------------\n")

    # ------------------------------------------------------------------
    # 4b. Pipeline Dependency
    # Tasks are nodes and dependencies are directed edges of a DAG. Given
    # starting task(s) whose prerequisites have all been executed and a
    # goal task, map out the tasks leading to the goal in a valid
    # topological order, without re-scheduling executed tasks. A starting
    # task is included if and only if it is a prerequisite of the goal.
    # Example: tasks [A..F] with A->C, B->C, C->E, E->F;
    #   start A, goal F     -> A,B,C,E,F or B,A,C,E,F
    #   start A,C, goal F   -> C,E,F
    # ------------------------------------------------------------------
    questions = {'starting task': '73', 'goal task': '36'}

    # (prerequisite, dependent) pairs.
    relations = [(97,102), (75,31), (75,37), (100,20), (102,36), (102,37),
                 (102,31), (16,37), (39,73), (39,100), (41,73), (41,112),
                 (62,55), (112,97), (20,94), (20,97), (21,20), (73,20),
                 (56,102), (56,75), (56,55), (55,31), (55,37), (94,56),
                 (94,102)]

    st = int(questions['starting task']); print ('Starting Task: %2d'%(st))
    gt = int(questions['goal task']); print ('Goal Task: %2d'%(gt))

    # Adjacency list, successors kept in the order they appear in relations.
    successors = {}
    for src, dst in relations:
        successors.setdefault(src, []).append(dst)

    def iter_paths(node, path):
        """Yield every path (list of task ids) from ``node`` to ``gt``.

        Depth-first in relation order; branches that dead-end before the
        goal are dropped. Assumes the relations form a DAG (no cycles).
        """
        if node == gt:
            yield path
            return
        for nxt in successors.get(node, ()):
            yield from iter_paths(nxt, path + [nxt])

    # NOTE(review): this lists start->goal paths only. Prerequisites of the
    # goal that do not descend from the starting task (e.g. 100 and 21,
    # which 20 depends on) are not scheduled; confirm whether a full
    # topological ordering of all missing prerequisites is required.
    output = list(iter_paths(st, [st]))

    print ('\nThe different paths from Starting Target to Goal Target\n')
    pprint(output)
    print ("\n------------------------END OF ANSWER #4b------------------------\n")
# MAGIC %md # MAGIC # MAGIC #### Run Quality Control # MAGIC # MAGIC Perform variant-wise filtering on Hardy-Weinberg equilibrium P-values and allele frequency # COMMAND ---------- hwe = (spark.read.format("delta").load(delta_silver_path).where( (fx.col("alleleFrequencies").getItem(0) >= allele_freq_cutoff) & (fx.col("alleleFrequencies").getItem(0) <= (1.0 - allele_freq_cutoff))).withColumn( "log10pValueHwe", fx.when(fx.col("pValueHwe") == 0, 26).otherwise(-fx.log10(fx.col("pValueHwe"))))) # COMMAND ---------- hwe_cutoff = calculate_pval_bonferroni_cutoff(hwe) mlflow.log_param("Hardy-Weinberg P value cutoff", hwe_cutoff) # COMMAND ---------- display( plot_histogram(df=hwe.select("log10pValueHwe"), col="log10pValueHwe", xlabel='-log_{10}(P)', xmin=0, xmax=25, nbins=50,
# COMMAND ---------- # MAGIC %md # MAGIC # MAGIC #### Run Quality Control # MAGIC # MAGIC Perform variant-wise filtering on Hardy-Weinberg equilibrium P-values and allele frequency # COMMAND ---------- hwe = (spark.read.format("delta") .load(delta_silver_path) .where((fx.col("alleleFrequencies").getItem(0) >= allele_freq_cutoff) & (fx.col("alleleFrequencies").getItem(0) <= (1.0 - allele_freq_cutoff))) .withColumn("log10pValueHwe", fx.when(fx.col("pValueHwe") == 0, 26).otherwise(-fx.log10(fx.col("pValueHwe"))))) # COMMAND ---------- hwe_cutoff = calculate_pval_bonferroni_cutoff(hwe) mlflow.log_param("Hardy-Weinberg P value cutoff", hwe_cutoff) # COMMAND ---------- display(plot_histogram(df=hwe.select("log10pValueHwe"), col="log10pValueHwe", xlabel='-log_{10}(P)', xmin=0, xmax=25, nbins=50, plot_title="hardy-weinberg equilibrium",
def createTrans08(sparkDF):
    # ===========================
    # douglas fletcher
    # purpose: derive the next batch
    # of engineered columns (log
    # ratios, indicator flags) and
    # drop the raw columns they
    # replace
    # input:
    # sparkDF type sparkDF
    # output:
    # sparkDFTrans type sparkDF
    # ===========================

    def addColumn(frame, name, buildExpr):
        # keep every existing column and append buildExpr(frame) as `name`
        return frame.select("*", buildExpr(frame).alias(name))

    def nullToZero(frame, name):
        # overwrite nulls in `name` with 0 (e.g. where the log produced null)
        current = getattr(frame, name)
        return frame.withColumn(name, when(current.isNull(), 0).otherwise(current))

    out = sparkDF

    # difference of two previously computed log columns
    out = addColumn(
        out, "LogRatio90to6089DaysLate",
        lambda f: f.LogNumberOfTimes90DaysLate
        - f.LogNumberOfTime6089DaysPastDueNotWorse)

    # open credit lines / loans: flag, log10, and log per household member
    out = addColumn(
        out, "AnyOpenCreditLinesOrLoans",
        lambda f: when(f.NumberOfOpenCreditLinesAndLoans > 0, 1).otherwise(0))
    out = addColumn(
        out, "LogNumberOfOpenCreditLinesAndLoans",
        lambda f: log10(f.NumberOfOpenCreditLinesAndLoans))
    out = nullToZero(out, "LogNumberOfOpenCreditLinesAndLoans")
    out = addColumn(
        out, "LogNumberOfOpenCreditLinesAndLoansPerPerson",
        lambda f: f.LogNumberOfOpenCreditLinesAndLoans
        - log1p(f.NumberOfDependents))

    # dependents: flag and log1p, then the raw count is dropped
    out = addColumn(
        out, "HasDependents",
        lambda f: when(f.NumberOfDependents > 0, 1).otherwise(0))
    out = addColumn(
        out, "LogHouseholdSize",
        lambda f: log1p(f.NumberOfDependents))
    out = out.drop(out.NumberOfDependents)

    # debt ratio: log10 with nulls zeroed, then the raw ratio is dropped
    out = addColumn(out, "LogDebtRatio", lambda f: log10(f.DebtRatio))
    out = nullToZero(out, "LogDebtRatio")
    out = out.drop(out.DebtRatio)

    # log debt relative to delinquency counts
    out = addColumn(
        out, "LogDebtPerDelinquency",
        lambda f: f.LogDebt - log1p(f.NumberOfTimesPastDue))
    out = addColumn(
        out, "LogDebtPer90DaysLate",
        lambda f: f.LogDebt - log1p(f.NumberOfTimes90DaysLate))

    # unknown-income debt ratio: log10 with nulls zeroed, then per person
    out = addColumn(
        out, "LogUnknownIncomeDebtRatio",
        lambda f: log10(f.UnknownIncomeDebtRatio))
    out = nullToZero(out, "LogUnknownIncomeDebtRatio")
    out = addColumn(
        out, "LogUnknownIncomeDebtRatioPerPerson",
        lambda f: f.LogUnknownIncomeDebtRatio - f.LogHouseholdSize)

    return out
def getTrendRows(df, attrbs, targColumn, targVal, recordsNo, targVals_counts,
                 depth, parent_attrb, max_depth=3, top_n=3):
    """Recursively collect the strongest attribute/value rules for a target.

    For every attribute in ``attrbs`` the rows of ``df`` are grouped by that
    attribute and pivoted on ``targColumn``; the count for ``targVal`` is
    turned into support, confidence, lift and a combined metric
    (support * log10(lift)). The ``top_n`` rules by metric are appended to
    the module-level ``global_trends`` list, and the search recurses into
    the rows matching each of those rules with that attribute removed.

    Args:
        df: Spark DataFrame holding the records to analyse.
        attrbs: names of the attribute columns still available for splitting.
        targColumn: name of the target column that is pivoted.
        targVal: target value whose rules are scored.
        recordsNo: total number of records, used as the support denominator.
        targVals_counts: mapping from target value to its record count.
        depth: current recursion level.
        parent_attrb: list of "attr == value" conditions leading to ``df``;
            mutated during the call and restored before returning.
        max_depth: recursion level at which the search stops (default 3,
            the previously hard-coded limit).
        top_n: number of best rules kept per level (default 3, the
            previously hard-coded cut).

    Returns:
        ``global_trends`` after the new rows were appended, or ``None`` when
        ``depth`` has already reached ``max_depth``.
    """
    # Depth limit reached: nothing is computed at this level.
    if depth >= max_depth:
        return

    target = str(targVal)
    trends = []
    for attrb in attrbs:
        df_t = df.groupBy(attrb).pivot(targColumn).count().select(attrb, target)
        df_t = df_t.withColumn('Support_' + target,
                               col(target) / float(recordsNo))
        df_t = df_t.withColumn('Confidence_' + target,
                               col(target) / float(targVals_counts[targVal]))
        df_t = df_t.withColumn(
            'Lift_' + target,
            col('Confidence_' + target)
            / (targVals_counts[targVal] / float(recordsNo)))
        df_t = df_t.withColumn(
            'Metric_' + target,
            col('Support_' + target) * log10(col('Lift_' + target)))
        # Each collected row becomes
        # [attribute, value, count, support, confidence, lift, metric].
        trends.extend([attrb] + list(row) for row in df_t.collect())

    def metric(row):
        # A null metric (e.g. the value never co-occurs with targVal) must
        # rank last; comparing None with a float raises on Python 3.
        return float('-inf') if row[-1] is None else row[-1]

    trends = sorted(trends, key=metric, reverse=True)[:top_n]

    for trend in trends:
        # Restrict to the rows matching this rule and stop splitting on it.
        df_e = df.filter(col(trend[0]) == trend[1]).drop(trend[0])
        remaining = [x for x in attrbs if x != trend[0]]

        parent_attrb.append(str(trend[0]) + ' == ' + str(trend[1]))
        # Output row: the joined condition chain followed by
        # count, support, confidence, lift, metric.
        global_trends.append([' & '.join(parent_attrb)] + trend[-5:])

        getTrendRows(df_e, remaining, targColumn, targVal, recordsNo,
                     targVals_counts, depth + 1, parent_attrb,
                     max_depth, top_n)
        parent_attrb.pop()

    return global_trends
df2 = df2.withColumn("text_entry", split("text_entry", " ")) #Explode eachtext_entry value into multiple rows to get _id with each word of text_entry df2 = df2.withColumn("token", explode(col("text_entry"))) #Calculating Term Frequency by grouping based on ‘_id’ and ‘token’ and counting how many times each token occurs in each document df_tf = df2.groupby("_id", "token").agg(F.count("text_entry").alias("tf")) #Calculating Document Frequency by grouping on each token and counting the number of documents it occurs in df_idf = df2.groupby("token").agg(F.countDistinct("_id").alias("df")) #Converting ‘df’ column to Double Type in order for easy calculation later on df_idf = df_idf.withColumn("df", df_idf["df"].cast(DoubleType())) #Calculating IDF values df_idf = df_idf.withColumn("idf", F.log10(N/df_idf["df"])) #Joining df_tf and df_idf based on token columns tokensWithTfIdf = df_tf.join(df_idf, df_tf["token"] == df_idf["token"], how='left').drop(df_idf["token"]) #Calculating TF-IDF Score tokensWithTfIdf = tokensWithTfIdf.withColumn("tf_idf", col("tf") * col("idf")) #Change ordering of Columns & Caching the Inverted Index tokensWithTfIdf = tokensWithTfIdf.select("token", "_id", "tf", "df", "idf", "tf_idf") print("\n") #Showing the top 20 rows of the Inverted Index tokensWithTfIdf.show() #Caching the Inverted Index for further usage
def parse_az_phewas_evidence(az_phewas_df: DataFrame) -> DataFrame:
    """
    Turn Astra Zeneca's PheWAS Portal data into target/disease evidence.

    Args:
        az_phewas_df: DataFrame with Astra Zeneca's PheWAS Portal data

    Returns:
        evd_df: DataFrame with Astra Zeneca's data following the t/d evidence schema.
    """
    # Columns of the output, in the order they are selected.
    to_keep = [
        'datasourceId', 'datatypeId', 'allelicRequirements',
        'targetFromSourceId', 'diseaseFromSource',
        'diseaseFromSourceMappedId', 'pValueMantissa', 'pValueExponent',
        'beta', 'betaConfidenceIntervalLower', 'betaConfidenceIntervalUpper',
        'oddsRatio', 'oddsRatioConfidenceIntervalLower',
        'oddsRatioConfidenceIntervalUpper', 'resourceScore', 'ancestry',
        'ancestryId', 'literature', 'projectId', 'cohortId',
        'studySampleSize', 'studyCases', 'studyCasesWithQualifyingVariants',
        'statisticalMethod', 'statisticalMethodOverview',
    ]

    is_quantitative = col('Type') == 'Quantitative'
    is_binary = col('Type') == 'Binary'

    # Constant provenance columns.
    evd_df = az_phewas_df
    for name, value in (
        ('datasourceId', lit('gene_burden')),
        ('datatypeId', lit('genetic_association')),
        ('literature', array(lit('34375979'))),
        ('projectId', lit('AstraZeneca PheWAS Portal')),
        ('cohortId', lit('UK Biobank 450k')),
    ):
        evd_df = evd_df.withColumn(name, value)

    # Gene / phenotype identifiers, then the trait mappings joined on the
    # phenotype label (unmapped phenotypes are kept by the left join).
    evd_df = evd_df.withColumnRenamed('Gene', 'targetFromSourceId')
    evd_df = evd_df.withColumnRenamed('Phenotype', 'diseaseFromSource')
    evd_df = evd_df.join(
        import_trait_mappings(), on='diseaseFromSource', how='left')

    # Score and p-value split into mantissa and exponent.
    # NOTE(review): truncating log10 and subtracting 1 appears to give a
    # mantissa of 10 when pValue is an exact power of ten - confirm that
    # is acceptable downstream.
    evd_df = evd_df.withColumn('resourceScore', col('pValue'))
    evd_df = evd_df.withColumn(
        'pValueExponent',
        log10(col('pValue')).cast(IntegerType()) - lit(1))
    evd_df = evd_df.withColumn(
        'pValueMantissa',
        round(col('pValue') / pow(lit(10), col('pValueExponent')), 3))

    # Effect sizes: beta columns are filled only for quantitative traits,
    # odds-ratio columns only for binary traits; the rest stay null.
    for name, condition, source in (
        ('beta', is_quantitative, 'beta'),
        ('betaConfidenceIntervalLower', is_quantitative, 'LCI'),
        ('betaConfidenceIntervalUpper', is_quantitative, 'UCI'),
        ('oddsRatio', is_binary, 'binOddsRatio'),
        ('oddsRatioConfidenceIntervalLower', is_binary, 'LCI'),
        ('oddsRatioConfidenceIntervalUpper', is_binary, 'UCI'),
    ):
        evd_df = evd_df.withColumn(name, when(condition, col(source)))

    # Ancestry constants.
    evd_df = evd_df.withColumn('ancestry', lit('EUR'))
    evd_df = evd_df.withColumn('ancestryId', lit('HANCESTRO_0005'))

    # Study size and method columns renamed to the schema's names.
    for old_name, new_name in (
        ('nSamples', 'studySampleSize'),
        ('nCases', 'studyCases'),
        ('nCasesQV', 'studyCasesWithQualifyingVariants'),
        ('CollapsingModel', 'statisticalMethod'),
    ):
        evd_df = evd_df.withColumnRenamed(old_name, new_name)

    # The overview starts as a copy of the method code and is then replaced
    # through METHOD_DESC; the code itself stays untouched for the
    # allelic-requirement test below.
    evd_df = evd_df.withColumn(
        'statisticalMethodOverview', col('statisticalMethod'))
    evd_df = evd_df.replace(
        to_replace=METHOD_DESC, subset=['statisticalMethodOverview'])

    # 'rec' models are labelled recessive, every other model dominant.
    evd_df = evd_df.withColumn(
        'allelicRequirements',
        when(
            col('statisticalMethod') == 'rec',
            array(lit('recessive'))).otherwise(array(lit('dominant'))),
    )

    return evd_df.select(to_keep).distinct()
# Expression function name -> name of the one-argument function in
# pyspark.sql.functions that implements it.
_TOCOLUMNS_UNARY = {
    "abs": "abs", "fabs": "abs",
    "arccos": "acos", "arcsin": "asin", "arctan": "atan",
    "ceil": "ceil", "floor": "floor", "rint": "rint",
    "cos": "cos", "cosh": "cosh",
    "sin": "sin", "sinh": "sinh",
    "tan": "tan", "tanh": "tanh",
    "exp": "exp", "expm1": "expm1",
    "log": "log", "log10": "log10", "log1p": "log1p", "log2": "log2",
    "sqrt": "sqrt", "factorial": "factorial", "isnan": "isnan",
}

# Expression function name -> two-argument function in pyspark.sql.functions.
_TOCOLUMNS_BINARY = {"arctan2": "atan2", "hypot": "hypot", "pow": "pow"}

# Expression function name -> function taking all arguments at once.
_TOCOLUMNS_VARIADIC = {
    "max": "greatest", "fmax": "greatest",
    "min": "least", "fmin": "least",
}

# Shifts: only supported when the shift amount is a constant.
_TOCOLUMNS_SHIFT = {"left_shift": "shiftLeft", "right_shift": "shiftRight"}

# Expression function name -> operator applied to two translated columns.
_TOCOLUMNS_OPERATORS = {
    # "fmod" used to be shadowed by an earlier NotImplementedError branch,
    # which left its "%" handling unreachable; it is now translated too.
    # NOTE(review): confirm the sign convention of Spark's % for negative
    # operands matches what callers expect from "mod".
    "mod": lambda a, b: a % b,
    "fmod": lambda a, b: a % b,
    "numpy.equal": lambda a, b: a == b,
    "numpy.not_equal": lambda a, b: a != b,
    "numpy.less": lambda a, b: a < b,
    "numpy.less_equal": lambda a, b: a <= b,
    "numpy.add": lambda a, b: a + b,
    "numpy.subtract": lambda a, b: a - b,
    "numpy.multiply": lambda a, b: a * b,
    "numpy.true_divide": lambda a, b: a / b,
    "numpy.logical_or": lambda a, b: a | b,
    "numpy.logical_and": lambda a, b: a & b,
}


def tocolumns(df, expr):
    """Translate a histbook expression tree into a PySpark Column of ``df``.

    Args:
        df: Spark DataFrame whose columns the expression's names refer to.
        expr: a ``histbook.expr`` node: ``Const``, ``Name``, ``Predicate``
            or ``Call``.

    Returns:
        A PySpark Column: a literal for ``Const``, ``df[expr.value]`` for
        ``Name``/``Predicate``, and the translated function or operator
        applied to the recursively translated arguments for ``Call``.

    Raises:
        NotImplementedError: the called function has no translation here.
            This covers every name absent from the tables above (among them
            arccosh, arcsinh, arctanh, copysign, erf, erfc, gamma, lgamma,
            isinf, isfinite, trunc, xor, conjugate, exp2, heaviside,
            logaddexp, logaddexp2 and sign), shifts by a non-constant
            amount, and ``numpy.isin`` against a non-constant collection.
        AssertionError: ``expr`` is none of the node types listed above.
    """
    import math
    import pyspark.sql.functions as fcns

    if isinstance(expr, histbook.expr.Const):
        return fcns.lit(expr.value)
    if isinstance(expr, (histbook.expr.Name, histbook.expr.Predicate)):
        return df[expr.value]
    if not isinstance(expr, histbook.expr.Call):
        raise AssertionError(expr)

    name = expr.fcn

    def arg(index):
        # Translate the index-th argument of this call.
        return tocolumns(df, expr.args[index])

    if name in _TOCOLUMNS_UNARY:
        return getattr(fcns, _TOCOLUMNS_UNARY[name])(arg(0))

    if name in _TOCOLUMNS_BINARY:
        return getattr(fcns, _TOCOLUMNS_BINARY[name])(arg(0), arg(1))

    if name in _TOCOLUMNS_VARIADIC:
        return getattr(fcns, _TOCOLUMNS_VARIADIC[name])(
            *[tocolumns(df, x) for x in expr.args])

    if name in _TOCOLUMNS_OPERATORS:
        return _TOCOLUMNS_OPERATORS[name](arg(0), arg(1))

    if name in _TOCOLUMNS_SHIFT:
        # The Spark shift functions take a plain integer bit count, so the
        # amount must be a constant; anything else is reported below.
        if isinstance(expr.args[1], histbook.expr.Const):
            return getattr(fcns, _TOCOLUMNS_SHIFT[name])(
                arg(0), expr.args[1].value)
        raise NotImplementedError(name)

    if name == "rad2deg":
        return arg(0) * (180.0 / math.pi)
    if name == "deg2rad":
        return arg(0) * (math.pi / 180.0)

    if name == "where":
        return fcns.when(arg(0), arg(1)).otherwise(arg(2))

    if name == "numpy.logical_not":
        return ~arg(0)

    if name == "numpy.isin":
        # Python's ``in`` cannot produce a Column (it yields a bool or
        # raises), so membership goes through Column.isin instead.
        # Assumes a Const collection holds an iterable of literal values
        # - TODO confirm against histbook's expression builder.
        collection = expr.args[1]
        if isinstance(collection, histbook.expr.Const):
            return arg(0).isin(list(collection.value))
        raise NotImplementedError(name)

    raise NotImplementedError(name)
def compile_log10(t, expr, scope, **kwargs):
    """Compile a log10 expression into ``F.log10`` of its translated argument.

    The argument of the expression's operation (``expr.op().arg``) is
    translated with ``t.translate`` in ``scope``. Extra keyword arguments
    are accepted but not used.
    """
    operand = expr.op().arg
    translated = t.translate(operand, scope)
    return F.log10(translated)
how="left_outer") # COMMAND ---------- #check if demographics are needed if demographics: print 'generating demographics' demographicTbl = generate_demographic_info(orderUserInfoDF, userCols) entityInfoDF = entityInfoDF.join(demographicTbl, entity, how='left_outer') # COMMAND ---------- # finally add bunch of log values for selected cols cols_for_log = [ x for x, y in entityInfoDF.dtypes if any(map(x.startswith, summaryOps['logs'])) if y in ['double', 'float', 'int', 'long', 'bigint'] ] print "gen log for cols", cols_for_log for col in cols_for_log: entityInfoDF = entityInfoDF.withColumn( "log_" + col, F.when(entityInfoDF[col] > 0, F.log10(col)).otherwise(0)) # COMMAND ---------- entityInfoDF = entityInfoDF.cache() entityInfoDF.repartition(1).write.mode("overwrite").format('com.databricks.spark.csv') \ .options(header='true', mode="overwrite") \ .save('/mnt/' + AWS_BUCKET_NAME + '/' + destEntityInfoPath)