from pyspark.sql.functions import udf, array, length, size
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import VectorAssembler


def text_features(p_df):
    """
    Extracts features derived from the quora question texts.
    :param p_df: A DataFrame.
    :return: A DataFrame.
    """
    diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType())
    common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))), IntegerType())
    unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))), IntegerType())

    p_df = p_df.withColumn("len_q1", length("question1")).withColumn("len_q2", length("question2"))
    p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2")))
    p_df = p_df.withColumn("words_q1", size("question1_words")).withColumn("words_q2", size("question2_words"))
    p_df = p_df.withColumn("common_words", common_words(array("question1_words", "question2_words")))
    p_df = p_df.withColumn("unique_chars_q1", unique_chars("question1")) \
        .withColumn("unique_chars_q2", unique_chars("question2"))

    assembler = VectorAssembler(
        inputCols=["len_q1", "len_q2", "diff_len", "words_q1", "words_q2",
                   "common_words", "unique_chars_q1", "unique_chars_q2"],
        outputCol="text_features")
    p_df = assembler.transform(p_df)
    return p_df
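# Usage sketch for text_features above (not from the original source): assumes a
# SparkSession named `spark` and that question1_words/question2_words come from a
# tokenizer upstream, as the function expects.
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer

spark = SparkSession.builder.getOrCreate()
pairs = spark.createDataFrame(
    [("What is Spark?", "What is Apache Spark?")],
    ["question1", "question2"])
for q in ("question1", "question2"):
    pairs = Tokenizer(inputCol=q, outputCol=q + "_words").transform(pairs)
text_features(pairs).select("text_features").show(truncate=False)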
def data(self):
    from pyspark.sql.functions import array, explode, col, lit
    return self.spark.range(10).toDF('id') \
        .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \
        .withColumn("v", explode(col('vs'))) \
        .drop('vs') \
        .withColumn('w', lit(1.0))
def test_vectorized_udf_basic(self):
    from pyspark.sql.functions import pandas_udf, col, array
    df = self.spark.range(10).select(
        col('id').cast('string').alias('str'),
        col('id').cast('int').alias('int'),
        col('id').alias('long'),
        col('id').cast('float').alias('float'),
        col('id').cast('double').alias('double'),
        col('id').cast('decimal').alias('decimal'),
        col('id').cast('boolean').alias('bool'),
        array(col('id')).alias('array_long'))
    f = lambda x: x
    str_f = pandas_udf(f, StringType())
    int_f = pandas_udf(f, IntegerType())
    long_f = pandas_udf(f, LongType())
    float_f = pandas_udf(f, FloatType())
    double_f = pandas_udf(f, DoubleType())
    decimal_f = pandas_udf(f, DecimalType())
    bool_f = pandas_udf(f, BooleanType())
    array_long_f = pandas_udf(f, ArrayType(LongType()))
    res = df.select(str_f(col('str')), int_f(col('int')),
                    long_f(col('long')), float_f(col('float')),
                    double_f(col('double')), decimal_f('decimal'),
                    bool_f(col('bool')), array_long_f('array_long'))
    self.assertEquals(df.collect(), res.collect())
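# Minimal standalone sketch of the scalar pandas_udf pattern the test above
# exercises; `spark` is an assumed SparkSession and pyarrow must be installed.
from pyspark.sql.functions import pandas_udf, col
from pyspark.sql.types import LongType

plus_one = pandas_udf(lambda s: s + 1, LongType())  # s is a pandas Series
spark.range(3).select(plus_one(col("id")).alias("id_plus_one")).show()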
def test_manual(self):
    df = self.data
    sum_udf = self.pandas_agg_sum_udf
    mean_udf = self.pandas_agg_mean_udf
    mean_arr_udf = pandas_udf(
        self.pandas_agg_mean_udf.func,
        ArrayType(self.pandas_agg_mean_udf.returnType),
        self.pandas_agg_mean_udf.evalType)

    result1 = df.groupby('id').agg(
        sum_udf(df.v),
        mean_udf(df.v),
        mean_arr_udf(array(df.v))).sort('id')
    expected1 = self.spark.createDataFrame(
        [[0, 245.0, 24.5, [24.5]],
         [1, 255.0, 25.5, [25.5]],
         [2, 265.0, 26.5, [26.5]],
         [3, 275.0, 27.5, [27.5]],
         [4, 285.0, 28.5, [28.5]],
         [5, 295.0, 29.5, [29.5]],
         [6, 305.0, 30.5, [30.5]],
         [7, 315.0, 31.5, [31.5]],
         [8, 325.0, 32.5, [32.5]],
         [9, 335.0, 33.5, [33.5]]],
        ['id', 'sum(v)', 'avg(v)', 'avg(array(v))'])

    self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
def test_smvArrayFlatten(self):
    df = self.createDF('a:String;b:String;c:String', ',,;1,2,;2,3,4')
    df1 = df.select(F.array(
        F.array(F.lit(None), F.col('a')),
        F.array(F.col('a'), F.col('b'), F.col('c'))
    ).alias('aa'))

    res1 = df1.select(F.col('aa').smvArrayFlatten(StringType()).alias('a'))\
        .select(SF.smvArrayCat('|', F.col('a')).alias('k'))
    exp = self.createDF("k: String",
                        """||||;
                        |1|1|2|;
                        |2|2|3|4""")

    res2 = df1.select(F.col('aa').smvArrayFlatten(df1).alias('a'))\
        .select(SF.smvArrayCat('|', F.col('a')).alias('k'))

    self.should_be_same(res1, exp)
    self.should_be_same(res2, exp)
def test_array_type_correct(self):
    df = self.data.withColumn("arr", array(col("id"))).repartition(1, "id")

    output_schema = StructType(
        [StructField('id', LongType()),
         StructField('v', IntegerType()),
         StructField('arr', ArrayType(LongType()))])

    udf = pandas_udf(lambda pdf: pdf, output_schema, PandasUDFType.GROUPED_MAP)

    result = df.groupby('id').apply(udf).sort('id').toPandas()
    expected = df.toPandas().groupby('id').apply(udf.func).reset_index(drop=True)
    self.assertPandasEqual(expected, result)
def solveCNF(path="", startLevel=10):
    # 20 Vars - uf20-01.cnf
    # 4726 sol. 7714 nodes
    # 50 Vars - uf50-05.cnf
    if path == "":
        path = path  # no-op: no default CNF path is configured
    expandids = udf(expandLevel, ArrayType(LongType()))
    graphDF = sc.parallelize([])
    (numVar, numClauses, clauses) = parseNPFunction(path)
    initData = [i for i in range(0, 2**startLevel)]
    initRDD = sc.parallelize(initData)
    initDF = initRDD.map(lambda x: Row(id=int(x), level=startLevel))  # int() for Python 3 (was long())
    graphDF = sqlContext.createDataFrame(initDF)
    for newvar in range(startLevel + 1, numVar + 1):
        walkerDF = graphDF.select("id").filter(
            col("level") == startLevel).withColumn("state", lit(3))
        for walklevel in range(startLevel, newvar):
            if newvar == startLevel + 1:
                reducedClause = minimClause(clauses, walklevel, newvar, onlyNewVariable=False)
            else:
                reducedClause = minimClause(clauses, walklevel, newvar)
            # Parse state with clauses
            walkerDF = walkerDF.rdd.map(lambda row: Row(
                id=row['id'],
                state=updateState(reducedClause, row['id'], row['state'], walklevel, newvar))).toDF()
            # Create hash = id mod 2^level in graphDF to delete all branches of a non-solution
            graphDF = graphDF.select("*").withColumn("hashID", graphDF['id'] % (2**walklevel))
            # Join graphDF with the updated-status walkerDF and delete all non-solutions at this
            # level for the new variable (marked by state = -1). Put state = 0 (not used) on those
            # ids that do not exist in walkerDF.
            hashGraph = graphDF.join(
                walkerDF, walkerDF["id"] == graphDF["hashID"], how='leftouter').select(
                    graphDF["id"], col("level"), col("hashID"),
                    col("state")).fillna(0).where(col("state") != -1)
            # Not a leaf of the tree/graph
            if walklevel + 1 != newvar:
                walkerDF = hashGraph.select("id", "state").where(
                    (hashGraph["state"] != 0) & (col("level") == (walklevel + 1)))
                graphDF = hashGraph.select("id", "level")
            # Leaf of the tree/graph
            else:
                # Expand the new level with the new variable
                newLevelDF = walkerDF.withColumn(
                    "id", explode(array(expandids(col("id"), col("state"), lit(newvar))))).withColumn(
                        "level", lit(newvar)).select(explode("id").alias("id"), "level")
                graphDF = graphDF.select("id", "level").union(newLevelDF).cache()
    return graphDF
def test_smvIsAnyIn(self):
    df = self.createDF("k:String; v:String;", "a,b;c,d;,").select(
        array(col("k"), col("v")).alias("arr"))
    res = df.select(col("arr").smvIsAnyIn("a", "z").alias("isFound"))
    expected = self.createDF("isFound:Boolean", "true;false;false")
    self.should_be_same(expected, res)
# Note the distance to the nearest point in time leading or lagging you.
# Note the distance of that nearest point to its neighbor.
# If you are closer to your neighbor than it is to its closest, merge and
# create a new point with a new outage time.
def timestamp_average(timestamps):
    seconds = 0
    for i in range(0, len(timestamps)):
        seconds += timestamps[i]
    return int(seconds / len(timestamps))

max_cluster_size = 500
pw_df = pw_df.select(
    array("core_id").alias("core_id"),
    array("tx").alias("tx"),
    array("feeder_id").alias("feeder_id"),
    "outage_time",
    array("restore_time").alias("restore_time"),
    array(F.struct("location_latitude", "location_longitude")).alias("location"))
pw_df = pw_df.withColumn("outage_times", F.array("outage_time"))
#print("Starting with count:", pw_df.count())

pw_finalized_outages = spark.createDataFrame([], pw_df.schema)

# All of the local checkpoints should probably be switched to just checkpoints.
# Note that the checkpointing is CRITICAL to the function of the algorithm in Spark;
# otherwise the RDD lineage is recalculated every loop and the plan creation time
# balloons exponentially. Checkpointing truncates the plan.
def oversample_df(major_df_count, minor_df_count, major_df, minor_df):
    ratio = range(round(major_df_count / minor_df_count))
    oversampled_df = minor_df.withColumn(
        "nv", explode(array([lit(x) for x in ratio]))).drop('nv')
    combined_df = major_df.unionAll(oversampled_df)
    return combined_df
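# Hypothetical usage of oversample_df (not from the original source): balance a
# binary-labelled DataFrame `df` by replicating the minority class; the label
# values 0/1 are assumptions.
major_df = df.filter("label = 0")
minor_df = df.filter("label = 1")
balanced_df = oversample_df(major_df.count(), minor_df.count(), major_df, minor_df)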
cancer = 'Cancer_{}'.format(i)
seq = 'Seq_{}'.format(i)
df_04 = df_04.withColumn(
    cancer, F.concat_ws('_', F.col('items').getItem(i - 1), F.col(seq)))
if (i < 4):
    df_03 = df_03.withColumn(
        cancer, F.concat_ws('_', F.col('items').getItem(i - 1), F.col(seq)))
if (i < 3):
    df_02 = df_02.withColumn(
        cancer, F.concat_ws('_', F.col('items').getItem(i - 1), F.col(seq)))

df_02 = df_02.withColumn('items', F.array('Cancer_1', 'Cancer_2'))
df_03 = df_03.withColumn('items', F.array('Cancer_1', 'Cancer_2', 'Cancer_3'))
df_04 = df_04.withColumn(
    'items', F.array('Cancer_1', 'Cancer_2', 'Cancer_3', 'Cancer_4'))
df_02 = df_02.select('Patient_ID', 'items', 'Sex', 'Ages')
df_03 = df_03.select('Patient_ID', 'items', 'Sex', 'Ages')
df_04 = df_04.select('Patient_ID', 'items', 'Sex', 'Ages')
data = df_02.union(df_03).union(df_04)
data = data_final  # note: this overrides the union above with data_final
data = data.select('Patient_ID', 'items', 'Sex', 'Ages')
for i in range(4):
    j = i + 1
    level = 'Level_{}'.format(j)
    age = 'Age_{}'.format(j)
dataset = dataset.withColumn("year", (F.col("year") - min_year) / (max_year - min_year)) # Normalizamos columnas mes, día, hora, minuto y segundo dataset = dataset.withColumn("month", (F.col("month") - 1) / (12 - 1)) dataset = dataset.withColumn("day", (F.col("day") - 1) / (31 - 1)) dataset = dataset.withColumn("hour", (F.col("hour") - 0) / (23 - 0)) dataset = dataset.withColumn("minute", (F.col("minute") - 0) / (59 - 0)) dataset = dataset.withColumn("second", (F.col("second") - 0) / (59 - 0)) # Word2Vec dataset = dataset.withColumn( 'categorical', F.concat(F.array('rat'), F.array('mcc'), F.array('mnc'), F.array('msin'), F.array('tac'), F.array('snr'))) word2Vec_output_path = "{}/data/word2VecModel.bin".format(base_path) word2Vec = Word2VecModel.load(word2Vec_output_path) dataset = word2Vec.transform(dataset) # VectorAssembler sizeHint = VectorSizeHint(inputCol="vcategorical", handleInvalid="skip", size=50) dataset = sizeHint.transform(dataset) vector_assembler_output_path = "{}/data/vectorAssemblerW2VModel.bin".format( base_path)
#documentDF = sqlContext.createDataFrame([
#    ("Hi I heard about Spark".split(" "), ),
#    ("I wish Java could use case classes".split(" "), ),
#    ("Logistic regression models are neat".split(" "), )
#], ["text"])
#print(documentDF.printSchema())
#print(df_raw8.printSchema())

# In[78]:

import pyspark.sql.functions as F
df_raw8 = df_raw8.withColumn("new_text", F.array(F.col("text")))

# In[79]:

df_raw8.show()

# In[80]:

from pyspark.ml.feature import Word2Vec
from pyspark.sql import SQLContext
sqlContext = SQLContext(spark)
#print(df2_user.dtypes)
df2_user_clean = df2_user.drop_duplicates(
    [' TimeSt', 'Country', 'Province', 'City', 'Latitude', 'Longitude'])
print('length of clean dataframe: %d' % df2_user_clean.count())
#a1 = np.array(df_poi['lat'])
#a2 = np.array(df_poi['long'])
#a3 = np.array(df_poi['poi_id'])
#array_poi = df_poi.select(array(' Latitude','Longitude','POIID')).collect()
array_poi = df_poi.sort("Longitude", ascending=True).collect()
#a4 = np.array(df2_user_clean['Latitude'])
#a5 = np.array(df2_user_clean['Longitude'])
array_userLoc = df2_user_clean.select(array('Latitude', 'Longitude')).collect()

# In[150]:

kdT = create_KD_Tree(array_poi)
print("\n\nk-d Tree: %s" % (kdT))

# In[151]:

pos_list = []
pos_dis = []
for point in array_userLoc:
    return_val = nn_kdtree2(point, kdT)
    pos_ans = return_val[0]
    dis_ans = haversine_dis(point, return_val)
origin_desc = "origin_desc" dest_desc = "dest_desc" airports = airports.dropDuplicates(['code']) carriers = carriers.dropDuplicates(['code']) air_car = airlines.join(carriers, airlines.carrier == carriers.code).select( [a for a in airlines.columns] + [carriers.description.alias(carrier_desc)]) print "\n\n\n" print air_car.head(1) combined = air_car.join(airports, air_car.origin == airports.code)\ .select([a for a in air_car.columns] + [airports.description.alias(origin_desc)])\ .join(airports, air_car.dest == airports.code)\ .select([a for a in air_car.columns] + [origin_desc] + [airports.description.alias(dest_desc)]) print "\n\n\n" print combined.head(2) print "\n\n\n" combined = combined.withColumn( "origin_dest_names", array(origin_desc, dest_desc)).drop(origin_desc).drop(dest_desc) combined = combined.toDF(*[a.lower() for a in combined.columns]) combined.registerTempTable("temp_table") hive_context.sql( "CREATE TABLE flight.flight_data_denorm STORED AS ORC AS SELECT * from temp_table" )
def test_score_logistic_model(spark):
    lvl1df = spark.read.parquet(f'{data_root}/bt_reduceded_1part.snappy.parquet')
    sample_blocks_df = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet') \
        .withColumn('sample_block', f.col('sample_block').cast('string')) \
        .withColumn('sample_ids', f.expr('transform(sample_ids, v -> cast(v as string))'))
    sample_blocks = {r.sample_block: r.sample_ids for r in sample_blocks_df.collect()}

    with open(f'{data_root}/test_score_logistic_model.json') as json_file:
        test_values = json.load(json_file)

    map_key_pattern = ['sample_block', 'label', 'alpha_name']
    reduce_key_pattern = ['header_block', 'header', 'label', 'alpha_name']
    model_key_pattern = ['sample_block', 'label', 'alpha_name']
    score_key_pattern = ['sample_block', 'label']

    map_udf = f.pandas_udf(
        lambda key, pdf: map_irls_eqn(key, map_key_pattern, pdf, labeldf, sample_blocks,
                                      covdf, beta_cov_dict, maskdf, alphas),
        irls_eqn_struct, PandasUDFType.GROUPED_MAP)
    reduce_udf = f.pandas_udf(
        lambda key, pdf: reduce_irls_eqn(key, reduce_key_pattern, pdf),
        irls_eqn_struct, PandasUDFType.GROUPED_MAP)
    model_udf = f.pandas_udf(
        lambda key, pdf: solve_irls_eqn(key, model_key_pattern, pdf, labeldf, alphas, covdf),
        model_struct, PandasUDFType.GROUPED_MAP)
    score_udf = f.pandas_udf(
        lambda key, pdf: score_models(key, score_key_pattern, pdf, labeldf, sample_blocks,
                                      alphas, covdf, maskdf, metric='log_loss'),
        cv_struct, PandasUDFType.GROUPED_MAP)

    modeldf = lvl1df \
        .withColumn('alpha_name', f.explode(f.array([f.lit(n) for n in alphas.keys()]))) \
        .groupBy(map_key_pattern) \
        .apply(map_udf) \
        .groupBy(reduce_key_pattern) \
        .apply(reduce_udf) \
        .groupBy(model_key_pattern) \
        .apply(model_udf) \
        .withColumn('alpha_label_coef',
                    f.expr('struct(alphas[0] AS alpha, labels[0] AS label, coefficients[0] AS coefficient)')) \
        .groupBy('header_block', 'sample_block', 'header', 'sort_key',
                 f.col('alpha_label_coef.label')) \
        .agg(f.sort_array(f.collect_list('alpha_label_coef')).alias('alphas_labels_coefs')) \
        .selectExpr('*', 'alphas_labels_coefs.alpha AS alphas',
                    'alphas_labels_coefs.label AS labels',
                    'alphas_labels_coefs.coefficient AS coefficients') \
        .drop('alphas_labels_coefs', 'label')

    cvdf = lvl1df.drop('header_block', 'sort_key') \
        .join(modeldf, ['header', 'sample_block'], 'right') \
        .withColumn('label', f.coalesce(f.col('label'), f.col('labels').getItem(0))) \
        .groupBy(score_key_pattern) \
        .apply(score_udf)

    outdf = cvdf.filter(
        f'sample_block = "{test_sample_block}" AND label = "{test_label}"').toPandas()
    scores_glow = outdf['score'].to_numpy()
    assert np.allclose(np.array(test_values['scores']), scores_glow)
df = (
    spark
    .read
    .option("flattenInfoFields", False)
    .format('vcf')
    .load(INPUT_VCF)
)
df = glow.transform("split_multiallelics", df)
df.printSchema()

df = df.withColumn("names", f.array([f.concat(
    f.col('contigName'),
    f.lit(":"),
    f.col('start') + 1,
    f.lit(":"),
    f.col('referenceAllele'),
    f.lit(">"),
    f.col('alternateAlleles')[0]
)]))
df.limit(10).toPandas()

# +
import json
import shlex

input_df = df.select([
    f.col('contigName'),
    f.col('start'),
    f.col('end'),
df_device = df.select([col for col in df.columns if not col.startswith("rule_")])
ruleCnt = sorted([int(col.split("_")[1]) for col in df.columns
                  if col.split("_")[0] == "rule"
                  and col.split("_")[2] in ["reason", "score", "type"]])[-1]
for i in range(ruleCnt + 1):
    rule = "rule_" + str(i)
    df = df.withColumn("new" + rule,
                       sf.concat(
                           sf.coalesce(sf.col(rule + "_reason"), sf.lit("$$$")), sf.lit("^"),
                           sf.coalesce(sf.col(rule + "_score"), sf.lit("$$$")), sf.lit("^"),
                           sf.coalesce(sf.col(rule + "_type"), sf.lit("$$$"))
                       ))
df = df.withColumn(
    "ruleArray",
    sf.array([col for col in df.columns if col.split("_")[0] == "newrule"])
)
dfexplode = df.select(sf.col("id"), sf.col("applicantId"), sf.col("loan_application_id"),
                      sf.col("createdDatePT"), sf.col("year"), sf.col("month"), sf.col("day"),
                      sf.explode_outer("ruleArray").alias("rule"))
dfsplitCol = dfexplode.withColumn("rule_reason", sf.split("rule", r"\^")[0]) \
    .withColumn("rule_score", sf.split("rule", r"\^")[1]) \
    .withColumn("rule_type", sf.split("rule", r"\^")[2])
df_rules = dfsplitCol.select(
    sf.col("id"), sf.col("applicantId"), sf.col("loan_application_id"),
    sf.col("createdDatePT"), sf.col("year"), sf.col("month"), sf.col("day"),
    sf.when(sf.col("rule_reason") == "$$$", None).otherwise(sf.col("rule_reason")).alias("rule_reason"),
    sf.when(sf.col("rule_score") == "$$$", None).otherwise(sf.col("rule_score")).cast("integer").alias("rule_score"),
    sf.when(sf.col("rule_type") == "$$$", None).otherwise(sf.col("rule_type")).alias("rule_type")) \
    .where(sf.col("rule_reason").isNotNull() | sf.col("rule_score").isNotNull() | sf.col("rule_type").isNotNull())
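# Self-contained round-trip sketch of the pack -> explode -> split pattern used
# above: wide columns are concatenated with '^', exploded to rows, then split
# back out. `spark` is an assumed SparkSession; the column names are made up.
demo = spark.createDataFrame([(1, "a^1^t", "b^2^u")], ["id", "newrule_0", "newrule_1"])
demo = demo.withColumn("ruleArray", sf.array("newrule_0", "newrule_1"))
demo = demo.select("id", sf.explode_outer("ruleArray").alias("rule"))
demo.select("id",
            sf.split("rule", r"\^")[0].alias("rule_reason"),
            sf.split("rule", r"\^")[1].alias("rule_score"),
            sf.split("rule", r"\^")[2].alias("rule_type")).show()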
def parse(path_to_dir):
    global TARGET_DIR
    TARGET_DIR = os.path.join(TARGET_DIR, os.path.split(path_to_dir)[1])

    if 'DAS5' in os.environ:  # If we want to execute it on the DAS-5 super computer
        print("We are on DAS5, {0} is master.".format(os.environ['HOSTNAME'] + ".ib.cluster"))
        spark = SparkSession.builder \
            .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "28G") \
            .config("spark.executor.cores", "8") \
            .config("spark.executor.instances", "10") \
            .config("spark.driver.memory", "40G") \
            .config("spark.sql.execution.arrow.enabled", "true") \
            .getOrCreate()
    else:
        findspark.init(spark_home="<path to spark>")
        spark = SparkSession.builder \
            .master("local[8]") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "20G") \
            .config("spark.driver.memory", "8G") \
            .getOrCreate()

    if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):
        print("######\nStart parsing Tasks\n######")
        task_df = spark.read.format('com.databricks.spark.csv').options(
            header='true', inferschema='true').load(
                os.path.join(path_to_dir, '*.csv.processed'))

        # Drop the pref table, saving memory, and filter out unsuccessful jobs
        # as their information is not reliable
        task_df = task_df.drop('pref').filter(
            task_df.status == ":instance.status/success").drop('status').cache()

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def sub_two_datetimes(s1, s2):
            arr = []
            for i in s1.keys():
                d1 = datetime.datetime.strptime(s1[i], '%a %b %d %H:%M:%S %Z %Y')
                d2 = datetime.datetime.strptime(s2[i], '%a %b %d %H:%M:%S %Z %Y')
                arr.append(int((d2 - d1).total_seconds() * 1000))
            return pd.Series(arr)

        task_df = task_df \
            .withColumn('wait_time', sub_two_datetimes(F.col('submit-time'), F.col('start-time'))) \
            .withColumn('runtime', sub_two_datetimes(F.col('start-time'), F.col('end-time')))

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def date_time_to_unix(series):
            arr = []
            epoch = datetime.datetime.utcfromtimestamp(0)
            for i in series.keys():
                arr.append(np.int64((datetime.datetime.strptime(
                    series[i], '%a %b %d %H:%M:%S %Z %Y') - epoch).total_seconds() * 1000))
            return pd.Series(arr)

        task_df = task_df.withColumn(
            'submit-time', date_time_to_unix(F.col('submit-time'))).withColumnRenamed(
                'submit-time', "ts_submit").drop('start-time').drop('end-time').cache()

        min_ts = task_df.agg({"ts_submit": "min"}).collect()[0][0]
        task_df = task_df.withColumn('ts_submit', F.col('ts_submit') - F.lit(min_ts))

        @F.pandas_udf(T.DoubleType(), F.PandasUDFType.SCALAR)
        def convert_to_kb(v):
            return v * 1024

        task_df = task_df.withColumn('memory', convert_to_kb(
            task_df.memory)).withColumnRenamed("memory", "memory_consumption")

        @F.pandas_udf(T.IntegerType(), F.PandasUDFType.SCALAR)
        def string_to_int(v):
            arr = []
            for i in v.keys():
                arr.append(mmh3.hash(v[i], signed=True))
            return pd.Series(arr)

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def string_to_long(v):
            arr = []
            for i in v.keys():
                arr.append(mmh3.hash64(v[i], signed=True)[0])
            return pd.Series(arr)

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def assign_workflow_ids(v):
            arr = []
            for i in v.keys():
                if v[i]:
                    arr.append(mmh3.hash64(v[i], signed=True)[0])
                else:
                    # Assign a UUID; collision chance is negligible.
                    arr.append(mmh3.hash64(uuid4().bytes, signed=True)[0])
            return pd.Series(arr)

        task_df = task_df.withColumn('user', string_to_int(
            task_df.user)).withColumnRenamed("user", "user_id")
        task_df = task_df.withColumn('job-uuid', string_to_long(
            F.col('job-uuid'))).withColumnRenamed('job-uuid', 'task_id')

        type_udf = F.udf(lambda x: "Independent" if x is None else "Composite", T.StringType())
        task_df = task_df.withColumn('type', type_udf(task_df.simset))

        task_df = task_df.withColumn('simset', assign_workflow_ids(
            F.col('simset'))).withColumnRenamed('simset', "workflow_id")
        task_df = task_df.withColumnRenamed('cpu', 'resource_amount_requested')
        task_df = task_df.withColumnRenamed('instance', 'resource_used')

        # Set the static items that are not present in the trace
        task_df = task_df.withColumn('submission_site', F.lit(0))
        task_df = task_df.withColumn('parents', F.array().cast(T.ArrayType(T.LongType())))
        task_df = task_df.withColumn('children', F.array().cast(T.ArrayType(T.LongType())))
        task_df = task_df.withColumn('group_id', F.lit(0))
        task_df = task_df.withColumn('nfrs', F.lit("{}"))
        task_df = task_df.withColumn('params', F.lit("{}"))
        task_df = task_df.withColumn('memory_requested', F.lit(-1))
        task_df = task_df.withColumn('network_io_time', F.lit(-1))
        task_df = task_df.withColumn('disk_io_time', F.lit(-1))
        task_df = task_df.withColumn('disk_space_requested', F.lit(-1))
        task_df = task_df.withColumn('energy_consumption', F.lit(-1))

        os.makedirs(os.path.join(TARGET_DIR, Task.output_path()), exist_ok=True)
        task_df.write.parquet(os.path.join(TARGET_DIR, Task.output_path()),
                              mode="overwrite", compression="snappy")
        print("######\nDone parsing Tasks\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, TaskState.output_path())):
        print("######\nStart parsing TaskState\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(os.path.join(TARGET_DIR, Task.output_path()))

        task_state_structtype = T.StructType([
            T.StructField("ts_start", T.LongType(), False),
            T.StructField("ts_end", T.LongType(), False),
            T.StructField("workflow_id", T.LongType(), False),
            T.StructField("task_id", T.LongType(), False),
            T.StructField("resource_id", T.LongType(), False),
            T.StructField("cpu_rate", T.DoubleType(), False),
            T.StructField("canonical_memory_usage", T.DoubleType(), False),
            T.StructField("assigned_memory", T.DoubleType(), False),
            T.StructField("minimum_memory_usage", T.DoubleType(), False),
            T.StructField("maximum_memory_usage", T.DoubleType(), False),
            T.StructField("disk_io_time", T.DoubleType(), False),
            T.StructField("maximum_disk_bandwidth", T.DoubleType(), False),
            T.StructField("local_disk_space_usage", T.DoubleType(), False),
            T.StructField("maximum_cpu_rate", T.DoubleType(), False),
            T.StructField("maximum_disk_io_time", T.DoubleType(), False),
            T.StructField("sample_rate", T.DoubleType(), False),
            T.StructField("sample_portion", T.DoubleType(), False),
            T.StructField("sampled_cpu_usage", T.DoubleType(), False),
            T.StructField("network_io_time", T.DoubleType(), False),
            T.StructField("maximum_network_bandwidth", T.DoubleType(), False),
        ])

        @F.pandas_udf(returnType=task_state_structtype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_task_states(df):
            workflow_id = df['workflow_id'].iloc[0]
            task_id = df['task_id'].iloc[0]
            ts_start = df['ts_submit'].min()
            ts_end = ts_start + df['runtime'].max()
            resource_id = df['resource_used'].iloc[0]
            cpu_rate = -1
            canonical_memory_usage = df['memory_consumption'].mean()
            assigned_memory = -1
            minimum_memory_usage = df['memory_consumption'].min()
            maximum_memory_usage = df['memory_consumption'].max()
            disk_io_time = -1
            maximum_disk_bandwidth = -1
            local_disk_space_usage = -1
            maximum_cpu_rate = -1
            maximum_disk_io_time = -1
            sample_rate = -1
            sample_portion = -1
            sampled_cpu_usage = -1
            network_io_time = -1
            maximum_network_bandwidth = -1

            data_dict = {
                "ts_start": ts_start,
                "ts_end": ts_end,
                "workflow_id": workflow_id,
                "task_id": task_id,
                "resource_id": resource_id,
                "cpu_rate": cpu_rate,
                "canonical_memory_usage": canonical_memory_usage,
                "assigned_memory": assigned_memory,
                "minimum_memory_usage": minimum_memory_usage,
                "maximum_memory_usage": maximum_memory_usage,
                "disk_io_time": disk_io_time,
                "maximum_disk_bandwidth": maximum_disk_bandwidth,
                "local_disk_space_usage": local_disk_space_usage,
                "maximum_cpu_rate": maximum_cpu_rate,
                "maximum_disk_io_time": maximum_disk_io_time,
                "sample_rate": sample_rate,
                "sample_portion": sample_portion,
                "sampled_cpu_usage": sampled_cpu_usage,
                "network_io_time": network_io_time,
                "maximum_network_bandwidth": maximum_network_bandwidth,
            }

            return pd.DataFrame(data_dict, index=[0])

        task_state_df = task_df.groupBy(['workflow_id', 'task_id']).apply(compute_task_states)

        os.makedirs(os.path.join(TARGET_DIR, TaskState.output_path()), exist_ok=True)
        task_state_df.write.parquet(os.path.join(TARGET_DIR, TaskState.output_path()),
                                    mode="overwrite", compression="snappy")
        print("######\nDone parsing TaskState\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())):
        print("######\nStart parsing Resources\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(os.path.join(TARGET_DIR, Task.output_path()))

        resource_id_column = [
            i.resource_used
            for i in task_df.select('resource_used').distinct().collect()
        ]

        resources = []
        for resource_id in resource_id_column:
            resources.append(
                Resource(resource_id, 'Cluster Node', 24, '', 256, -1, -1, '').get_parquet_dict())

        resource_df = pd.DataFrame(resources)

        os.makedirs(os.path.join(TARGET_DIR, Resource.output_path()), exist_ok=True)
        resource_df.to_parquet(os.path.join(TARGET_DIR, Resource.output_path(), 'part.0.parquet'),
                               engine="pyarrow")
        print("######\nDone parsing Resources\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())):
        print("######\nStart parsing Workflows\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(os.path.join(TARGET_DIR, Task.output_path()))

        workflow_structype = T.StructType([
            T.StructField("id", T.LongType(), False),
            T.StructField("ts_submit", T.LongType(), False),
            T.StructField("task_count", T.IntegerType(), False),
            T.StructField("critical_path_length", T.LongType(), False),
            T.StructField("critical_path_task_count", T.IntegerType(), False),
            T.StructField("approx_max_concurrent_tasks", T.IntegerType(), False),
            T.StructField("nfrs", T.StringType(), False),
            T.StructField("scheduler", T.StringType(), False),
            T.StructField("total_resources", T.DoubleType(), False),
            T.StructField("total_memory_usage", T.DoubleType(), False),
            T.StructField("total_network_usage", T.LongType(), False),
            T.StructField("total_disk_space_usage", T.LongType(), False),
            T.StructField("total_energy_consumption", T.LongType(), False),
        ])

        @F.pandas_udf(returnType=workflow_structype, functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_workflow_stats(df):
            id = df['workflow_id'].iloc[0]
            ts_submit = df['ts_submit'].min()
            task_count = len(df)
            critical_path_length = -1
            critical_path_task_count = -1
            approx_max_concurrent_tasks = -1
            nfrs = "{}"
            scheduler = "Cook"
            total_resources = df['resource_amount_requested'].sum()
            total_memory_usage = df['memory_consumption'].sum()
            total_network_usage = -1
            total_disk_space_usage = -1
            total_energy_consumption = -1

            data_dict = {
                "id": id,
                "ts_submit": ts_submit,
                'task_count': task_count,
                'critical_path_length': critical_path_length,
                'critical_path_task_count': critical_path_task_count,
                'approx_max_concurrent_tasks': approx_max_concurrent_tasks,
                'nfrs': nfrs,
                'scheduler': scheduler,
                'total_resources': total_resources,
                'total_memory_usage': total_memory_usage,
                'total_network_usage': total_network_usage,
                'total_disk_space_usage': total_disk_space_usage,
                'total_energy_consumption': total_energy_consumption,
            }

            return pd.DataFrame(data_dict, index=[0])

        workflow_df = task_df.groupBy('workflow_id').apply(compute_workflow_stats)
        workflow_df.explain(True)

        workflow_df.write.parquet(os.path.join(TARGET_DIR, Workflow.output_path()),
                                  mode="overwrite", compression="snappy")
        print("######\nDone parsing Workflows\n######")

    print("######\nStart parsing Workload\n######")
    pandas_task_df = pd.read_parquet(os.path.join(TARGET_DIR, Task.output_path()),
                                     engine="pyarrow")
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        pandas_task_df,
        domain="Industrial",
        start_date=None,
        end_date=None,
        authors=["Two Sigma"])

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()), exist_ok=True)
    with open(os.path.join(TARGET_DIR, Workload.output_path(), "generic_information.json"),
              "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)

        file.write(json.dumps(json_dict, default=default))
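# Self-contained sketch of the GROUPED_MAP pandas_udf pattern used by
# compute_task_states / compute_workflow_stats above (one summary row per
# group). `spark` is an assumed SparkSession; requires pyarrow.
import pandas as pd

demo_schema = T.StructType([
    T.StructField("workflow_id", T.LongType(), False),
    T.StructField("task_count", T.LongType(), False),
])

@F.pandas_udf(returnType=demo_schema, functionType=F.PandasUDFType.GROUPED_MAP)
def count_tasks(pdf):
    # pdf is a pandas DataFrame holding one whole group
    return pd.DataFrame({"workflow_id": [pdf["workflow_id"].iloc[0]],
                         "task_count": [len(pdf)]})

demo = spark.createDataFrame([(1, 10), (1, 11), (2, 12)], ["workflow_id", "task_id"])
demo.groupBy("workflow_id").apply(count_tasks).show()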
###########################################################################################
df = spark.read.format("mongo").option('database', 'jamendo').option('collection', 'chords').load()
sample_300_df = df.select(["_id", "chordRatio"]).limit(songs)
myFunc = f.udf(lambda array_to_list: [int(0) if e is None else int(1) for e in array_to_list],
               T.ArrayType(T.IntegerType()))
sample_300_df2 = sample_300_df.withColumn('chordRatioMinHash', myFunc('chordRatio'))
df0 = sample_300_df2.select(["_id", "chordRatio", "chordRatioMinHash"])

from pyspark.sql import Row
from pyspark.sql.functions import col
from sparkaid import flatten

df0_flat = flatten(df0)
columns_list1 = df0_flat.columns[1:-1]
array_df = df0_flat.select('_id', 'chordRatioMinHash', array(columns_list1).alias('chordRatioJS'))

# Fill NaNs with zeros in the array column
df2_flat = df0_flat.na.fill(float(0))
columns_list2 = df2_flat.columns[1:-1]
array_df2 = df2_flat.select('_id', 'chordRatioMinHash',
                            array(columns_list2).alias('chordRatioJS_no_Nulls'))

###
to_vector = udf(lambda a: Vectors.dense(a), VectorUDT())
data = array_df2.select('_id', 'chordRatioMinHash', "chordRatioJS_no_Nulls",
                        to_vector("chordRatioJS_no_Nulls").alias("chordRatioWJS"))
data.show(1, truncate=False)

import scipy.sparse
from pyspark.ml.linalg import Vectors, _convert_to_vector, VectorUDT
from pyspark.sql.functions import udf, col

## from dense to sparse array
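# A sketch of the dense -> sparse conversion the comment above points to (not
# from the original source), assuming the dense arrays live in
# "chordRatioJS_no_Nulls": keep only non-zero entries as a SparseVector.
to_sparse = udf(
    lambda a: Vectors.sparse(len(a), [(i, float(v)) for i, v in enumerate(a) if v != 0.0]),
    VectorUDT())
data = data.withColumn("chordRatioSparse", to_sparse("chordRatioJS_no_Nulls"))
data.select("_id", "chordRatioSparse").show(1, truncate=False)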
# %%
# Initialize chains w/ context, response, sender, author_id, next
chains = data.filter((data.in_response_to_tweet_id == "none")
                     & ~(data.response_tweet_id == "none"))\
    .select(
        functions.col('tweet_id').alias('response'),
        functions.col('response_tweet_id').alias('next'),
        'text',
        'author_id',
    )
chains = chains.withColumn('context', functions.lit("").cast(types.StringType()))
chains = chains.withColumn('sender', functions.lit("").cast(types.StringType()))
chains = chains.withColumn('rpos', functions.lit(1).cast(types.IntegerType()))
chains = chains.withColumn('tweets', functions.array().cast("array<string>"))
chains.persist()

# Initialize empty samples DF w/ context, sender, response, author_id
fields = ['sender', 'context', 'response', 'author_id']
samples_schema = types.StructType([
    types.StructField(field_name, types.StringType(), True)
    for field_name in fields
])
samples = spark.createDataFrame([], samples_schema)

#%%
MAX_DEPTH = 20
depth = 0
def columns_to_array_column(N):
    '''Collect the N MinHash columns "_1" .. "_N" into a single array column.'''
    # Loop variable renamed so it no longer shadows the parameter N
    return array([col("_" + str(i)) for i in range(1, N + 1)])
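# Hypothetical usage, assuming a DataFrame `df` with MinHash columns "_1" .. "_5":
df = df.withColumn("minhash_array", columns_to_array_column(5))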
def map_annotations_cols(dataframe: DataFrame,
                         f,
                         columns: list,
                         output_column: str,
                         annotatyon_type: str,
                         output_type: DataType = Annotation.arrayType()):
    """Creates a Spark UDF to map over multiple columns of Annotation results.

    Parameters
    ----------
    dataframe : DataFrame
        Input DataFrame
    f : function
        Function to apply to the columns
    columns : list
        Names of the input columns
    output_column : str
        Name of the output column
    annotatyon_type : str
        Annotator type
    output_type : DataType, optional
        Output type, by default Annotation.arrayType()

    Returns
    -------
    :class:`pyspark.sql.DataFrame`
        Transformed DataFrame

    Examples
    --------
    >>> from sparknlp.pretrained import PretrainedPipeline
    >>> from sparknlp.functions import *
    >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
    >>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
    >>> result = explain_document_pipeline.transform(data)
    >>> chunks_df = map_annotations_cols(
    ...     result,
    ...     lambda x: [
    ...         Annotation("tag", a.begin, a.end, a.result, a.metadata, a.embeddings)
    ...         for a in x
    ...     ],
    ...     ["pos", "ner"],
    ...     "tags",
    ...     "chunk"
    ... )
    >>> chunks_df.selectExpr("explode(tags)").show(truncate=False)
    +-------------------------------------------+
    |col                                        |
    +-------------------------------------------+
    |[tag, 0, 2, NNP, [word -> U.N], []]        |
    |[tag, 3, 3, ., [word -> .], []]            |
    |[tag, 5, 12, JJ, [word -> official], []]   |
    |[tag, 14, 18, NNP, [word -> Ekeus], []]    |
    |[tag, 20, 24, VBZ, [word -> heads], []]    |
    |[tag, 26, 28, IN, [word -> for], []]       |
    |[tag, 30, 36, NNP, [word -> Baghdad], []]  |
    |[tag, 37, 37, ., [word -> .], []]          |
    |[tag, 0, 2, B-ORG, [word -> U.N], []]      |
    |[tag, 3, 3, O, [word -> .], []]            |
    |[tag, 5, 12, O, [word -> official], []]    |
    |[tag, 14, 18, B-PER, [word -> Ekeus], []]  |
    |[tag, 20, 24, O, [word -> heads], []]      |
    |[tag, 26, 28, O, [word -> for], []]        |
    |[tag, 30, 36, B-LOC, [word -> Baghdad], []]|
    |[tag, 37, 37, O, [word -> .], []]          |
    +-------------------------------------------+
    """
    return dataframe.withColumn(
        output_column,
        map_annotations_array(f, output_type)(array(*columns)).alias(
            output_column, metadata={'annotatorType': annotatyon_type}))
@udf("double") def PRODUCE_agg_acc_distance_percent(xs): if xs: temp = xs[-1] return temp try: df_serve = df_serve.withColumn( "ts_acc_distance_percent", PRODUCE_ts_acc_distance_percent("ts_agg_acc_meters", "sd_len_route")) except: a = [0.0, 0.0] df_serve = df_serve.withColumn("ts_acc_distance_percent", F.array([F.lit(x) for x in a])) try: df_serve = df_serve.withColumn( "agg_acc_distance_percent", PRODUCE_agg_acc_distance_percent("ts_acc_distance_percent")) except: df_serve = df_serve.withColumn("agg_acc_distance_percent", lit(0.0)) #---------------------------------------------------------------# ''' Inference section ''' #Clustering save_path = "."
def main(spark, measure):
    from extract import get_sample
    from mapping import get_dx_codes, get_comorbidity_cols
    from point_ranges import range_map

    # Make sure measure name is uppercase to match mappings
    measure_name = measure.upper()

    # Get mappings for diagnosis and comorbidity columns
    codes = get_dx_codes(spark)
    comorb_map = get_comorbidity_cols(spark)

    # Get data sample, join to dx codes, and filter to measure
    data = get_sample(spark)
    data = data.join(codes, 'diagnosis_code')
    data = data.filter(data.measure_name == measure_name)

    if data.count() > 0:
        # Convert comorbidity columns to ints and sum them
        com_cols = comorb_map.get(measure_name, [])
        for col in com_cols:
            data = data.withColumn(col, (functions.upper(
                data[col]) == functions.lit('YES')).cast(IntegerType()))
        data = data.withColumn('ComorbidityScore', sum(data[x] for x in com_cols))

        # Reduce dataframe to relevant columns
        score_cols = ['LengthofStay', 'ED_visits', 'ComorbidityScore', 'Inpatient_visits']
        data = data.select(['encounter_id', 'patient_nbr'] + score_cols)

        # Assign point values for each of the score columns
        for col in score_cols:
            pts_col = col + '_pts'
            # Get ranges and point vals from range config
            attr_range = range_map.get(col)
            splits = [x[1] for x in attr_range] + [float("inf")]
            pts = functions.array([functions.lit(x[0]) for x in attr_range])
            # Transform data with bucketizer
            buckets = Bucketizer(splits=splits, inputCol=col, outputCol=pts_col)
            data = buckets.transform(data)
            # Turn bucket numbers into point values
            data = data.withColumn(pts_col, pts.getItem(data[pts_col].cast(IntegerType())))

        # Get LACE score for each row
        data = data.withColumn('LACEScore', sum(data[x + '_pts'] for x in score_cols))

        # Calculate ratio score
        num = data.filter(data.LACEScore > 9).count()
        denom = data.count()
        return num / float(denom)
    else:
        return None
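# Standalone sketch of the Bucketizer -> point-lookup step above: bucket a
# numeric column, then map each bucket index to its point value through
# array(...).getItem. `spark` is an assumed SparkSession; the splits and point
# values are made-up examples.
from pyspark.ml.feature import Bucketizer
from pyspark.sql import functions as F
from pyspark.sql.types import IntegerType

demo = spark.createDataFrame([(1.0,), (4.0,), (9.0,)], ["LengthofStay"])
splits = [0.0, 3.0, 7.0, float("inf")]       # bucket edges
pts = F.array(F.lit(0), F.lit(2), F.lit(5))  # points per bucket
demo = Bucketizer(splits=splits, inputCol="LengthofStay",
                  outputCol="bucket").transform(demo)
demo.withColumn("LengthofStay_pts",
                pts.getItem(F.col("bucket").cast(IntegerType()))).show()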
def cal_performance(date, period, input_batch, output_batch):
    # Get historical trading dates
    hist_dt = fetch_com_dt_hist(date)
    pef_horizions = {
        '1w': hist_dt.loc['B1W'].strftime('%Y%m%d'),
        '1m': hist_dt.loc['B1M'].strftime('%Y%m%d'),
        '3m': hist_dt.loc['B3M'].strftime('%Y%m%d'),
        '6m': hist_dt.loc['B6M'].strftime('%Y%m%d'),
        '1y': hist_dt.loc['B1Y'].strftime('%Y%m%d'),
        '3y': hist_dt.loc['B3Y'].strftime('%Y%m%d'),
        '5y': hist_dt.loc['B5Y'].strftime('%Y%m%d')
    }
    if period == 'all':
        start = None
    else:
        start = pef_horizions[period]

    ss = SparkSession \
        .builder \
        .appName(app_name + '_' + str(date) + '_' + period + '_' + str(is_debug)) \
        .getOrCreate()
    ss.sparkContext.setLogLevel('WARN')

    # Read data from csv and convert types
    schema = StructType([
        StructField('date', TimestampType(), True),
        StructField('sec_id', StringType(), True),
        StructField('nav', FloatType(), True),
        StructField('ret', FloatType(), True),
        StructField('stock', FloatType(), True),
        StructField('treasury', FloatType(), True),
        StructField('credit', FloatType(), True),
        StructField('bench_ret', FloatType(), True),
        StructField('fnd_category', IntegerType(), True),
    ])
    # ret_all_spark_df = ss.read.csv(data_source_csv_path + date + '/' + str(input_batch) + '/ret_all.csv',
    #                                header=True, schema=schema)
    ret_all_spark_df = ss.read.csv(data_source_csv_path + '20200320/1/ret_all.csv',
                                   header=True, schema=schema)

    # In debug mode, only keep a subset of funds
    if is_debug:
        logging.info('use debug')
        # sec_id_list = ['000006JK', '000028JK', '000134JK', '000135JK']
        # sec_id_list = ['005503JK', '005368JK', '004892JK', '150066JK',
        #                '000189JK', '000270JK', '000327JK']
        # sec_id_list = ['150066JK']
        # sec_id_list = ['006382JK']
        today_spark_df = ret_all_spark_df.filter(
            ret_all_spark_df.date == datetime.strptime(date, '%Y%m%d'))
        rank_w = Window.orderBy('sec_id')
        today_spark_df = today_spark_df.withColumn(
            'row_no', func.row_number().over(rank_w))
        today_spark_df = today_spark_df.filter(
            today_spark_df.row_no <= 100).select('sec_id')
        ret_all_spark_df = ret_all_spark_df.join(today_spark_df, on='sec_id', how='inner')
        # ret_all_spark_df = ret_all_spark_df[ret_all_spark_df.sec_id.isin(sec_id_list)]
    else:
        logging.info('use release')

    # Keep only data up to `date` (slicing); date and start are %Y%m%d strings,
    # so convert them to timestamps
    ret_all_spark_df = ret_all_spark_df[
        ret_all_spark_df.date <= datetime.strptime(date, '%Y%m%d')]

    # The last date must be no earlier than `date`
    w = Window.partitionBy('sec_id').orderBy('date').rowsBetween(
        Window.unboundedPreceding, Window.unboundedFollowing)
    ret_all_spark_df = ret_all_spark_df.withColumn('the_last_date',
                                                   func.last('date').over(w))
    ret_all_spark_df = ret_all_spark_df.where(
        ret_all_spark_df.the_last_date >= datetime.strptime(date, '%Y%m%d'))

    if period == 'all':
        # User-defined functions
        udf_mean = func.udf(lambda x: float(pd.Series(x).mean()), FloatType())
        udf_std = func.udf(lambda x: float(pd.Series(x).std()), FloatType())
        udf_min = func.udf(lambda x: float(pd.Series(x).min()), FloatType())
        udf_max = func.udf(lambda x: float(pd.Series(x).max()), FloatType())
        udf_p25 = func.udf(lambda x: float(pd.Series(x).quantile(0.25)), FloatType())
        udf_median = func.udf(lambda x: float(pd.Series(x).median()), FloatType())
        udf_p75 = func.udf(lambda x: float(pd.Series(x).quantile(0.75)), FloatType())
        udf_skew = func.udf(lambda x: float(pd.Series(x).skew()), FloatType())
        udf_kurt = func.udf(lambda x: float(pd.Series(x).kurt()), FloatType())
        udf_start = func.udf(lambda x: str(x[0].strftime('%Y%m%d')), StringType())
        udf_end = func.udf(lambda x: str(x[-1].strftime('%Y%m%d')), StringType())
        udf_cagr = func.udf(lambda x: float(Measure.cal_cagr(pd.Series(x))), FloatType())
        udf_cumret = func.udf(lambda x: float(Measure.cal_cumret(pd.Series(x))), FloatType())
        udf_standard_deviation = func.udf(
            lambda x: float(Measure.cal_standard_deviation(pd.Series(x))), FloatType())
        udf_max_drawdown = func.udf(
            lambda x, y: float(Measure.cal_max_drawdown(pd.Series(x, index=y))), FloatType())
        udf_sharpe = func.udf(lambda x: float(Measure.cal_sharpe(pd.Series(x))), FloatType())
        udf_downside_deviation = func.udf(
            lambda x: float(Measure.cal_downside_deviation(pd.Series(x))), FloatType())
        udf_alpha = func.udf(
            lambda x, y, z, w, f: float(
                Measure.cal_alpha(
                    pd.Series(x),
                    pd.DataFrame({'stock': y, 'treasury': z, 'credit': w}),
                    f)),
            FloatType())
        udf_marketbeta = func.udf(
            lambda x, y: float(Measure.cal_marketbeta(pd.Series(x), pd.Series(y))), FloatType())
        udf_information = func.udf(
            lambda x, y: float(Measure.cal_information(pd.Series(x), pd.Series(y))), FloatType())
        udf_treynor = func.udf(
            lambda x, y: float(Measure.cal_treynor(pd.Series(x), pd.Series(y))), FloatType())

        # Filter out funds with insufficient history
        ret_all_spark_df = ret_all_spark_df.withColumn('fund_length', func.count('date').over(w))
        ret_all_spark_df = ret_all_spark_df[ret_all_spark_df['fund_length'] >= 2]

        nt_val_spark_df = ret_all_spark_df[
            ret_all_spark_df.date == datetime.strptime(date, '%Y%m%d')].select(
                'sec_id', 'nav').withColumnRenamed('nav', 'nt_val')

        # Sort so that ret is ordered by date; otherwise the dates are not in
        # order after collect_list
        ret_all_spark_df = ret_all_spark_df.withColumn('ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('ret').over(w)))\
            .withColumn('stock_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('stock').over(w)))\
            .withColumn('treasury_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('treasury').over(w)))\
            .withColumn('credit_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('credit').over(w)))\
            .withColumn('date_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(datetime.strptime('2020-03-06', '%Y-%m-%d')))).otherwise(func.collect_list('date').over(w)))

        nav_agg_part_1 = ret_all_spark_df[
            ret_all_spark_df.date == ret_all_spark_df.the_last_date].select(
                'sec_id', 'ret_list', 'stock_ret_list', 'treasury_ret_list',
                'credit_ret_list', 'date_list', 'fnd_category')
        if is_debug:
            nav_agg_part_1.show()

        # ret_all_spark_df is no longer needed below; drop all its columns
        ret_all_spark_df = ret_all_spark_df.drop(
            'sec_id', 'date', 'nav', 'ret', 'stock', 'treasury', 'credit',
            'bench_ret', 'fnd_category', 'the_last_date', 'fund_length',
            'ret_list', 'stock_ret_list', 'treasury_ret_list',
            'credit_ret_list', 'date_list')
        if is_debug:
            ret_all_spark_df.show()

        # Join in the current-day NAV
        nav_agg_part_1 = nav_agg_part_1.join(nt_val_spark_df, on=['sec_id'], how='left')
        nav_agg_part_1 = nav_agg_part_1.withColumn('ret_mean', udf_mean('ret_list')) \
            .withColumn('ret_std', udf_std('ret_list')) \
            .withColumn('ret_min', udf_min('ret_list')) \
            .withColumn('ret_max', udf_max('ret_list')) \
            .withColumn('ret_p25', udf_p25('ret_list')) \
            .withColumn('ret_median', udf_median('ret_list')) \
            .withColumn('ret_p75', udf_p75('ret_list')) \
            .withColumn('ret_skew', udf_skew('ret_list')) \
            .withColumn('ret_kurtosis', udf_kurt('ret_list')) \
            .withColumn('ret_start', udf_start('date_list')) \
            .withColumn('cagr_sf', udf_cagr('ret_list'))\
            .withColumn('cumret_sf', udf_cumret('ret_list'))\
            .withColumn('vol_sf', udf_standard_deviation('ret_list'))\
            .withColumn('md_sf', udf_max_drawdown('ret_list', 'date_list'))\
            .withColumn('sharpe_sf', udf_sharpe('ret_list'))\
            .withColumn('dvol_sf', udf_downside_deviation('ret_list'))\
            .withColumn('alpha_sf', udf_alpha('ret_list', 'stock_ret_list', 'treasury_ret_list', 'credit_ret_list', 'fnd_category'))\
            .withColumn('beta_sf', udf_marketbeta('ret_list', 'stock_ret_list'))\
            .withColumn('ir_sf', udf_information('ret_list', 'stock_ret_list'))\
            .withColumn('treynor_sf', udf_treynor('ret_list', 'stock_ret_list'))

        # Drop intermediate columns
        nav_agg_part_1 = nav_agg_part_1.drop('ret_list', 'stock_ret_list',
                                             'treasury_ret_list', 'credit_ret_list',
                                             'date_list', 'fnd_category')
        if is_debug:
            nav_agg_part_1.show()
        if is_write_file:
            nav_agg_part_1.write.option('header', 'true').mode('overwrite').csv(
                output_csv_path + str(date) + "/" + str(output_batch) + "/" + period)
    else:
        # User-defined functions
        udf_cagr = func.udf(lambda x: float(Measure.cal_cagr(pd.Series(x))), FloatType())
        udf_cumret = func.udf(lambda x: float(Measure.cal_cumret(pd.Series(x))), FloatType())
        udf_aar = func.udf(lambda x: float(Measure.cal_aar(pd.Series(x))), FloatType())
        udf_alpha = func.udf(
            lambda x, y, z, w, f: float(
                Measure.cal_alpha(
                    pd.Series(x),
                    pd.DataFrame({'stock': y, 'treasury': z, 'credit': w}),
                    f)),
            FloatType())
        udf_standard_deviation = func.udf(
            lambda x: float(Measure.cal_standard_deviation(pd.Series(x))), FloatType())
        udf_downside_deviation = func.udf(
            lambda x: float(Measure.cal_downside_deviation(pd.Series(x))), FloatType())
        udf_max_drawdown = func.udf(
            lambda x, y: float(Measure.cal_max_drawdown(pd.Series(x, index=y))), FloatType())
        udf_marketbeta = func.udf(
            lambda x, y: float(Measure.cal_marketbeta(pd.Series(x), pd.Series(y))), FloatType())
        udf_var = func.udf(lambda x: float(Measure.cal_var(pd.Series(x))), FloatType())
        udf_sharpe = func.udf(lambda x: float(Measure.cal_sharpe(pd.Series(x))), FloatType())
        udf_sortino = func.udf(lambda x: float(Measure.cal_sortino(pd.Series(x))), FloatType())
        udf_calmar = func.udf(lambda x: float(Measure.cal_calmar(pd.Series(x))), FloatType())
        udf_omega = func.udf(lambda x: float(Measure.cal_omega(pd.Series(x))), FloatType())
        udf_information = func.udf(
            lambda x, y: float(Measure.cal_information(pd.Series(x), pd.Series(y))), FloatType())
        udf_treynor = func.udf(
            lambda x, y: float(Measure.cal_treynor(pd.Series(x), pd.Series(y))), FloatType())
        udf_m_square = func.udf(
            lambda x, y: float(Measure.cal_m_square(pd.Series(x), pd.Series(y))), FloatType())
        udf_sterling = func.udf(lambda x: float(Measure.cal_sterling(pd.Series(x))), FloatType())
        udf_burke = func.udf(lambda x: float(Measure.cal_burke(pd.Series(x))), FloatType())
        udf_tail = func.udf(lambda x: float(Measure.cal_tail(pd.Series(x))), FloatType())
        udf_rachev = func.udf(lambda x: float(Measure.cal_rachev(pd.Series(x))), FloatType())
        udf_stability = func.udf(lambda x: float(Measure.cal_stability(pd.Series(x))), FloatType())
        udf_min_monthly_return = func.udf(
            lambda x, y: float(Measure.cal_min_monthly_return(pd.Series(x, index=y))), FloatType())
        udf_max_monthly_return = func.udf(
            lambda x, y: float(Measure.cal_max_monthly_return(pd.Series(x, index=y))), FloatType())
        udf_monthly_odds = func.udf(
            lambda x, y: float(Measure.cal_monthly_odds(pd.Series(x, index=y))), FloatType())
        udf_picking = func.udf(
            lambda x, y: float(Measure.cal_picking(pd.Series(x), pd.Series(y, name='stock'))), FloatType())
        udf_timing = func.udf(
            lambda x, y: float(Measure.cal_timing(pd.Series(x), pd.Series(y, name='stock'))), FloatType())
        udf_trackerror = func.udf(
            lambda x, y, z: float(Measure.cal_trackerror(pd.Series(x), pd.Series(y), z)), FloatType())

        # Keep only funds whose first date is on or before the start date
        ret_all_spark_df = ret_all_spark_df.withColumn('the_first_date', func.first('date').over(w))
        ret_all_spark_df = ret_all_spark_df[
            ret_all_spark_df.the_first_date <= datetime.strptime(start, '%Y%m%d')]
        # Slice to data on/after the start date
        ret_all_spark_df = ret_all_spark_df[
            ret_all_spark_df.date >= datetime.strptime(start, '%Y%m%d')]

        # Filter out funds with insufficient history; after re-slicing, the
        # fund length must be recounted
        ret_all_spark_df = ret_all_spark_df.withColumn('fund_length', func.count('date').over(w))
        ret_all_spark_df = ret_all_spark_df[ret_all_spark_df['fund_length'] >= 2]

        # Sort so that ret is ordered by date; otherwise the dates are not in
        # order after collect_list
        ret_all_spark_df = ret_all_spark_df.withColumn('ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('ret').over(w)))\
            .withColumn('stock_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('stock').over(w)))\
            .withColumn('treasury_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('treasury').over(w)))\
            .withColumn('credit_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('credit').over(w))) \
            .withColumn('bench_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('bench_ret').over(w))) \
            .withColumn('date_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(datetime.strptime('2020-03-06', '%Y-%m-%d')))).otherwise(func.collect_list('date').over(w)))

        nav_agg_part_2 = ret_all_spark_df[
            ret_all_spark_df.date == ret_all_spark_df.the_last_date].select(
                'sec_id', 'ret_list', 'stock_ret_list', 'treasury_ret_list',
                'credit_ret_list', 'bench_ret_list', 'date_list', 'fnd_category')
        if is_debug:
            nav_agg_part_2.show()

        # ret_all_spark_df is no longer needed below; drop all its columns
        ret_all_spark_df = ret_all_spark_df.drop(
            'sec_id', 'date', 'nav', 'ret', 'stock', 'treasury', 'credit',
            'bench_ret', 'fnd_category', 'the_last_date', 'the_first_date',
            'fund_length', 'ret_list', 'stock_ret_list', 'treasury_ret_list',
            'credit_ret_list', 'bench_ret_list', 'date_list')
        if is_debug:
            ret_all_spark_df.show()

        nav_agg_part_2 = nav_agg_part_2.withColumn('cagr_' + period, udf_cagr('ret_list'))\
            .withColumn('cumret_' + period, udf_cumret('ret_list'))\
            .withColumn('aar_' + period, udf_aar('ret_list'))\
            .withColumn('alpha_' + period, udf_alpha('ret_list', 'stock_ret_list', 'treasury_ret_list', 'credit_ret_list', 'fnd_category'))\
            .withColumn('vol_' + period, udf_standard_deviation('ret_list'))\
            .withColumn('dvol_' + period, udf_downside_deviation('ret_list'))\
            .withColumn('md_' + period, udf_max_drawdown('ret_list', 'date_list'))\
            .withColumn('beta_' + period, udf_marketbeta('ret_list', 'stock_ret_list'))\
            .withColumn('var_' + period, udf_var('ret_list'))\
            .withColumn('sharpe_' + period, udf_sharpe('ret_list'))\
            .withColumn('sortino_' + period, udf_sortino('ret_list'))\
            .withColumn('calmar_' + period, udf_calmar('ret_list'))\
            .withColumn('omega_' + period, udf_omega('ret_list'))\
            .withColumn('ir_' + period, udf_information('ret_list', 'stock_ret_list'))\
            .withColumn('treynor_' + period, udf_treynor('ret_list', 'stock_ret_list'))\
            .withColumn('m_square_' + period, udf_m_square('ret_list', 'stock_ret_list'))\
            .withColumn('sterling_' + period, udf_sterling('ret_list'))\
            .withColumn('burke_' + period, udf_burke('ret_list'))\
            .withColumn('tail_' + period, udf_tail('ret_list'))\
            .withColumn('rachev_' + period, udf_rachev('ret_list'))\
            .withColumn('stability_' + period, udf_stability('ret_list'))

        if period in ['3m', '6m', '1y', '3y', '5y']:
            nav_agg_part_2 = nav_agg_part_2.withColumn('min_monthly_ret_' + period, udf_min_monthly_return('ret_list', 'date_list'))\
                .withColumn('max_monthly_ret_' + period, udf_max_monthly_return('ret_list', 'date_list'))\
                .withColumn('monthly_odds_' + period, udf_monthly_odds('ret_list', 'date_list'))
        if period in ['1m', '3m', '6m', '1y', '3y', '5y']:
            nav_agg_part_2 = nav_agg_part_2.withColumn('picking_' + period, udf_picking('ret_list', 'stock_ret_list'))\
                .withColumn('timing_' + period, udf_timing('ret_list', 'stock_ret_list'))\
                .withColumn('te_' + period, udf_trackerror('ret_list', 'bench_ret_list', 'fnd_category'))

        # Drop intermediate columns
        nav_agg_part_2 = nav_agg_part_2.drop('ret_list', 'stock_ret_list',
                                             'treasury_ret_list', 'credit_ret_list',
                                             'bench_ret_list', 'date_list', 'fnd_category')
        if is_debug:
            nav_agg_part_2.show()
        if is_write_file:
            nav_agg_part_2.write.option('header', 'true').mode('overwrite').csv(
                output_csv_path + str(date) + "/" + str(output_batch) + '/' + period)
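# Minimal sketch of the windowed collect_list pattern used throughout
# cal_performance: gather each fund's returns in date order into one array per
# row. `spark` is an assumed SparkSession (the function above builds its own as `ss`).
from pyspark.sql import Window
from pyspark.sql import functions as func

w_demo = Window.partitionBy('sec_id').orderBy('date').rowsBetween(
    Window.unboundedPreceding, Window.unboundedFollowing)
demo = spark.createDataFrame(
    [('f1', 1, 0.01), ('f1', 2, -0.02), ('f2', 1, 0.03)],
    ['sec_id', 'date', 'ret'])
demo.withColumn('ret_list', func.collect_list('ret').over(w_demo)).show()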
# %%
# Extract event sequences and groundtruth
udf_normalize = F.udf(
    lambda x: [[
        (x[i][0] - x[0][0] + (x[-1][0] - x[0][0]) / (len(x) - 1)) / args.time_divisor,
        float(x[i][1]),
    ] for i in range(len(x))],
    psql.types.ArrayType(psql.types.ArrayType(psql.types.FloatType())),
)

with Timer("extract event sequences"):
    event_seqs = (df_filtered.withColumn(
        "phrase", F.explode("phrases")).withColumn(
            "event", F.array("ts", "type")).groupby("phrase").agg(
                F.array_sort(
                    F.collect_set("event")).alias("event_seq")).filter(
                        F.size("event_seq").between(
                            args.min_seq_length, args.max_seq_length)).withColumn(
                                "event_seq", udf_normalize("event_seq"))).persist()

event_seqs.limit(5).toPandas()

# seq_lengths = (
#     event_seqs.select("phrase", F.size("event_seq").alias("size"))
#     .groupby("size")
#     .count()
#     .sort("size")
def data2(self):
    return self.spark.range(10).toDF('id') \
        .withColumn("ks", array([lit(i) for i in range(20, 30)])) \
        .withColumn("k", explode(col('ks'))) \
        .withColumn("v2", col('k') * 100) \
        .drop('ks')
def data(self):
    return (self.spark.range(10).toDF("id")
            .withColumn("vs", array([lit(i) for i in range(20, 30)]))
            .withColumn("v", explode(col("vs")))
            .drop("vs"))
# sf.coalesce(sf.col(rule+"_ECOA"),sf.lit("$$$")), sf.lit("^"),\ sf.coalesce(sf.col(rule+"_ECOA_code"),sf.lit("$$$")), sf.lit("^"),\ # sf.coalesce(sf.col(rule+"_Evaluation"),sf.lit("$$$")), sf.lit("^"),\ sf.coalesce(sf.col(rule+"_Evaluation_code"),sf.lit("$$$")), sf.lit("^"),\ sf.coalesce(sf.col(rule+"_FilingDate"),sf.lit("$$$")), sf.lit("^"),\ sf.coalesce(sf.col(rule+"_PlaintiffName"),sf.lit("$$$")), sf.lit("^"),\ sf.coalesce(sf.col(rule+"_ReferenceNumber"),sf.lit("$$$")), sf.lit("^"),\ # sf.coalesce(sf.col(rule+"_Status"),sf.lit("$$$")), sf.lit("^"),\ sf.coalesce(sf.col(rule+"_StatusDate"),sf.lit("$$$")), sf.lit("^"),\ sf.coalesce(sf.col(rule+"_Status_code"),sf.lit("$$$")), sf.lit("^")\ ) \ ) #dfcsPublicRecordInt.show(2,False) dfcsPublicRecordInt = dfcsPublicRecordInt.withColumn( "csPublicRecordArray", sf.array([col for col in dfcsPublicRecordInt.columns if col.split("_")[0] == "newcsPublicRecord"]) ) dfexplode = dfcsPublicRecordInt.select(sf.col("id"),sf.col("year"),sf.col("month"),sf.col("day") \ ,sf.explode_outer("csPublicRecordArray").alias("csPublicRecord") \ ) dfsplitCol = dfexplode.withColumn("csPublicRecord_Amount",sf.split("csPublicRecord","\^")[0]) \ .withColumn("csPublicRecord_Bankruptcy_AdjustmentPercent",sf.split("csPublicRecord","\^")[1]) \ .withColumn("csPublicRecord_Bankruptcy_AssetAmount",sf.split("csPublicRecord","\^")[2]) \ .withColumn("csPublicRecord_Bankruptcy_LiabilitiesAmount",sf.split("csPublicRecord","\^")[3]) \ .withColumn("csPublicRecord_Bankruptcy_RepaymentPercent",sf.split("csPublicRecord","\^")[4]) \ .withColumn("csPublicRecord_Bankruptcy_Type", sf.lit(None).cast(StringType())) \ .withColumn("csPublicRecord_Bankruptcy_Type_code",sf.split("csPublicRecord","\^")[5]) \ .withColumn("csPublicRecord_BookPageSequence",sf.split("csPublicRecord","\^")[6]) \ .withColumn("csPublicRecord_ConsumerComment",sf.split("csPublicRecord","\^")[7]) \ .withColumn("csPublicRecord_Court", sf.lit(None).cast(StringType())) \
def main_sdg(spark=None):
    spark = spark

    # Read the metadata file
    with open(f'{wd}' + '/data/sdg_metadata.json', 'r') as json_metadata:
        metadata = json.load(json_metadata)

    # Read the input file
    person_inputs = metadata['dataflows'][0]['sources'][0]
    person_inputs = glob.glob(f'{wd}' + person_inputs['path'])

    # Read the input json file
    with open(person_inputs[0], 'r') as json_entrada:
        entrada = json.load(json_entrada)
    df = spark.createDataFrame(entrada)

    # Initialize dataframes
    df_ok = None
    df_not_ok = None

    # List of all the transformations
    transformaciones = metadata['dataflows'][0]['transformations']

    # Execution process
    for item0 in transformaciones:
        if item0['type'] == 'validate_fields':
            validaciones = item0['params']['validations']
            for item1 in validaciones:
                validaciones_campo = item1['validations']
                dicc = {}
                for item2 in validaciones_campo:
                    df = df.withColumn(
                        item1['field'] + '_' + item2,
                        F.udf(validaciones_func)(F.col(item1['field']), F.lit(item2)))
            lista_cols_nuevas = [x for x in df.columns[3:]]
            df = df.withColumn('total', F.udf(suma_bool)(F.array(lista_cols_nuevas)))
            df_ok = df.filter(F.col('total') == True).drop('total')
            df_not_ok = df.filter(F.col('total') == False).drop('total')
            for item in lista_cols_nuevas:
                df_not_ok = df_not_ok.withColumn(
                    'code_' + item,
                    F.when(
                        F.col(item) == False,
                        F.udf(crear_diccionario)(
                            F.lit(item.split("_")[0]),
                            F.lit(item.split("_")[1]))).otherwise(F.lit(None)))
            lista_cols_nuevas_code = [x for x in df_not_ok.columns if 'code_' in x]
            df_not_ok = df_not_ok.withColumn(
                'code_total', F.udf(list_total)(F.array(lista_cols_nuevas_code)))
            lista_cols_borrar = lista_cols_nuevas + lista_cols_nuevas_code
            df_not_ok = df_not_ok.drop(*lista_cols_borrar)
        if (item0['type'] == 'add_fields') & (item0['params']['input'] == 'validation_ok'):
            for item3 in item0['params']['addFields']:
                tipo = item3['function']
                df_ok = df_ok.withColumn(item3['name'], F.udf(anadir_campos)(F.lit(tipo)))
        if (item0['type'] == 'add_fields') & (item0['params']['input'] == 'validation_ko'):
            for item3 in item0['params']['addFields']:
                tipo = item3['function']
                df_not_ok = df_not_ok.withColumn(
                    item3['name'],
                    F.udf(anadir_campos)(F.lit(tipo), F.col('code_total')))

    # Final dataframes and writing .json files to disk
    sinks = metadata['dataflows'][0]['sinks']
    try:
        df_ok = df_ok.select('name', 'age', 'office', 'dt')
    except Exception as e:
        logging.info(e)
        pass
    try:
        df_not_ok = df_not_ok.select('name', 'age', 'office', 'dt', 'arraycoderrorbyfield')
    except Exception as e:
        logging.info(e)
        pass
    escritura(df_ok, sinks, 'ok_with_date', f'{wd}')
    escritura(df_not_ok, sinks, 'validation_ko', f'{wd}')
def test_auto_mapper_fhir_patient_resource_include_null_properties( spark_session: SparkSession, ) -> None: # Arrange spark_session.createDataFrame( [ (1, "Qureshi", "Imran", "1970-01-01", "female"), (2, "Vidal", "Michael", "1970-02-02", None), ], ["member_id", "last_name", "first_name", "date_of_birth", "my_gender"], ).createOrReplaceTempView("patients") source_df: DataFrame = spark_session.table("patients") df = source_df.select("member_id") df.createOrReplaceTempView("members") # Act mapper = AutoMapper( view="members", source_view="patients", keys=["member_id"] ).complex( Patient( id_=FhirId(A.column("member_id")), birthDate=A.date(A.column("date_of_birth")), name=FhirList( [HumanName(use=NameUseCode("usual"), family=A.column("last_name"))], include_null_properties=True, ), gender=A.if_not_null( A.column("my_gender"), AdministrativeGenderCode(A.column("my_gender")) ), ) ) assert isinstance(mapper, AutoMapper) sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df) for column_name, sql_expression in sql_expressions.items(): print(f"{column_name}: {sql_expression}") result_df: DataFrame = mapper.transform(df=df) # Assert assert len(sql_expressions) == 21 assert str(sql_expressions["id"]) == str( substring( regexp_replace(col("b.member_id"), r"[^A-Za-z0-9\-\.]", "_"), 0, 63 ).alias("id") ) assert str(sql_expressions["resourceType"]) == str( lit("Patient").alias("resourceType") ) assert str(sql_expressions["birthDate"]) == str( coalesce( to_date(col("b.date_of_birth"), "y-M-d"), to_date(col("b.date_of_birth"), "yyyyMMdd"), to_date(col("b.date_of_birth"), "M/d/y"), ).alias("birthDate") ) assert str(sql_expressions["name"]) == str( filter( array( struct( lit("usual").alias("use"), lit(None).alias("text"), col("b.last_name").alias("family"), lit(None).alias("given"), lit(None).alias("prefix"), lit(None).alias("suffix"), lit(None).alias("period"), ) ), lambda x: x.isNotNull(), ).alias("name") ) assert str(sql_expressions["gender"]) == str( when(col("b.my_gender").isNull(), None) .otherwise(col("b.my_gender")) .alias("gender") ) result_df.printSchema() result_df.show() assert ( result_df.where("member_id == 1").selectExpr("name[0].use").collect()[0][0] == "usual" ) assert ( result_df.where("member_id == 1").selectExpr("name[0].family").collect()[0][0] == "Qureshi" ) assert ( result_df.where("member_id == 2").selectExpr("name[0].use").collect()[0][0] == "usual" ) assert ( result_df.where("member_id == 2").selectExpr("name[0].family").collect()[0][0] == "Vidal" )
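The "id" assertion in this test captures how FhirId sanitizes a source value: every character outside [A-Za-z0-9-.] becomes "_" and the result is truncated to 63 characters, within FHIR's 64-character id limit. A standalone sketch of the same expression (illustrative input, assuming an active SparkSession named spark):

from pyspark.sql import functions as F

df = spark.createDataFrame([("member 01@x",)], ["member_id"])
df.select(
    F.substring(
        F.regexp_replace("member_id", r"[^A-Za-z0-9\-\.]", "_"), 0, 63
    ).alias("id")
).show()  # -> member_01_x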
def create_word_vecs(
    df,
    word_col,
    desired_ops,
    word_vec_col="word_vec",
    normalize=False,
    offset_vals=None,
    scale_vals=None,
    clip_rng=None,
    ndigits=None,
):
    """
    Creates a "word vec" from a "word". A word is a list of N coordinates that
    may or may not be consecutive, and a word vec is a set of numbers that
    represents a word.

    Normalization standardizes word vecs by

                               (original value) - (offset_val)
        (standardized value) = --------------------------------
                                          (scale_val)

    Only one normalization method is currently supported:
    - mean-MAD: offset_val is the mean and scale_val is the MAD.

    Parameters
    ----------
    df: A pyspark.sql.dataframe.DataFrame.
    word_col: String. Name of the column that contains a word.
    desired_ops: List of tuples, or list of list of tuples. A list of
        operations to execute on df to get the word vec. Each tuple
        (OP_NAME, i, j, ...) is an operation where
        - OP_NAME = name of the operation used in ops_dict,
        - i, j, ... = parameters to the lambda function for OP_NAME.
        Tuples in the same list are normalized together. See ops_dict in the
        code for more detail.
    word_vec_col: String. Name of the new word vec column.
    normalize: One of {False, 'mean-mad'}. How to normalize the word vec.
    offset_vals: List of floats (optional). The offset value for each
        component in the word vec (used for inference).
    scale_vals: List of floats (optional). The scale value for each component
        in the word vec (used for inference).
    clip_rng: Tuple of two integers (optional). The (min, max) range to clip
        each component in the word vec to. Values smaller than min are
        replaced with min; values greater than max are capped at max.
    ndigits: Integer (optional). Number of decimal digits to round each
        component in the word vec to.

    Returns
    -------
    Three objects:
    - A pyspark.sql.dataframe.DataFrame with the word column replaced by a new
      word vec column,
    - offset_vals (list of floats),
    - scale_vals (list of floats).
    """
    # Check the normalize parameter
    assert normalize in {False, "mean-mad"}
    othercols = list(set(df.columns) - {word_col})
    # ks are kx, ky: the projection multipliers
    with_ks_df = cheap_ruler(df, word_col)
    # A dictionary of operations (features). Each operation must be returned
    # as a single-item list, or else it will raise an error.
    ops_dict = {
        # Distance projected to E-W
        "dx": lambda ii, ff: [
            (with_ks_df[word_col][ff][0] - with_ks_df[word_col][ii][0]) * with_ks_df.kx
        ],
        # Distance projected to N-S
        "dy": lambda ii, ff: [
            (with_ks_df[word_col][ff][1] - with_ks_df[word_col][ii][1]) * with_ks_df.ky
        ],
        # Distance
        "d": lambda ii, ff: [
            vsqrt(
                vpow((with_ks_df[word_col][ff][0] - with_ks_df[word_col][ii][0]) * with_ks_df.kx, 2)
                + vpow((with_ks_df[word_col][ff][1] - with_ks_df[word_col][ii][1]) * with_ks_df.ky, 2)
            )
        ],
        # Altitude
        "al": lambda ii, ff: [with_ks_df[word_col][ff][2] - with_ks_df[word_col][ii][2]],
        # Duration
        "t": lambda ii, ff: [with_ks_df[word_col][ff][3] - with_ks_df[word_col][ii][3]],
        # Speed
        "s": lambda ii, ff: [
            vsqrt(
                vpow((with_ks_df[word_col][ff][0] - with_ks_df[word_col][ii][0]) * with_ks_df.kx, 2)
                + vpow((with_ks_df[word_col][ff][1] - with_ks_df[word_col][ii][1]) * with_ks_df.ky, 2)
            )
            / (with_ks_df[word_col][ff][3] - with_ks_df[word_col][ii][3])
        ],
    }
    # Given a list of desired operations, find each operation in the
    # dictionary and add it to an execution plan. Retain the group structure
    # for multiple-column normalization.
    ops = []  # a list of operations to be added to the execution plan
    col_grps = []  # a list of column names that store the results of those operations
    for i, op_grp in enumerate(desired_ops):
        col_grp = []
        for j, op in enumerate(op_grp):
            op_name, ii, ff = op
            # Column to store the result of this operation
            col_name = "_" + str(i) + "_" + str(j)
            col_grp.append(col_name)
            # Find the operation in the dictionary and add it to ops
            ops.append(ops_dict[op_name](ii, ff)[0].alias(col_name))
        col_grps.append(col_grp)

    # Flatten the groups into a single list of column names
    word_vec_cols = [c for grp in col_grps for c in grp]
    with_raw_word_vecs = with_ks_df.select(*othercols, *ops)

    # Normalize
    if normalize:
        if scale_vals is None or offset_vals is None:
            # Compute the mean and MAD for every group
            offset_vals = []
            scale_vals = []
            for grp in col_grps:
                mu = compute_mean(with_raw_word_vecs, grp)
                mad = compute_mad(with_raw_word_vecs, grp, mean_val=mu)
                offset_vals += [mu] * len(grp)
                scale_vals += [mad] * len(grp)
        scale_ops = []
        scaled_word_vec_cols = []
        for i, cname in enumerate(word_vec_cols):
            scaled_cname = cname + "_scaled"
            scale_ops.append(
                ((with_raw_word_vecs[cname] - offset_vals[i]) / scale_vals[i]).alias(scaled_cname))
            scaled_word_vec_cols.append(scaled_cname)
        with_word_vecs = with_raw_word_vecs.select(*othercols, *scale_ops)
        # Clip features
        if clip_rng is not None:
            with_word_vecs = clip(with_word_vecs, scaled_word_vec_cols, clip_rng)
        word_vec_cols = scaled_word_vec_cols
    else:
        with_word_vecs = with_raw_word_vecs
        offset_vals = None
        scale_vals = None

    # Round to the desired number of decimal digits
    if ndigits is not None:
        with_word_vecs = round_columns(with_word_vecs, word_vec_cols, decimals=ndigits)

    # Combine the columns into a vec
    res_df = with_word_vecs.select(
        *othercols,
        array(*(with_word_vecs[col] for col in word_vec_cols)).alias(word_vec_col))

    return res_df, offset_vals, scale_vals
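As a concrete check of the mean-MAD scheme from the docstring, here is a tiny worked example (a sketch assuming an active SparkSession named spark; in create_word_vecs these statistics come from compute_mean and compute_mad):

import pyspark.sql.functions as F

vals = spark.createDataFrame([(2.0,), (4.0,), (9.0,)], ["x"])
mu = vals.agg(F.avg("x")).first()[0]                       # mean = 5.0
mad = vals.agg(F.avg(F.abs(F.col("x") - mu))).first()[0]   # MAD = (3 + 1 + 4) / 3 = 8/3
vals.select(((F.col("x") - mu) / mad).alias("x_scaled")).show()
# x = 9.0 standardizes to (9.0 - 5.0) / (8/3) = 1.5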
def data(self):
    from pyspark.sql.functions import array, explode, col, lit
    return self.spark.range(10).toDF('id') \
        .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \
        .withColumn("v", explode(col('vs'))) \
        .drop('vs') \
        .withColumn('w', lit(1.0))
def gngraph_datarepo_qry_getedges(self, dnodeDF, sqlst, nodemode):
    try:
        # First map gnedges
        gnsrch_log('GnSrchOps: datanodes querying for edges and derived-node flags ')
        self.get_metaedges_mapped_df()
        self.get_metanodes_mapped_df()
        gnsrch_log('GnSrchOps: Enumerating edges for datanodes on join ')
        cond = [
            ((self.__gnmetaEdgesDF_cached.gntgtnodeid == dnodeDF.gnnodeid) |
             (self.__gnmetaEdgesDF_cached.gnsrcnodeid == dnodeDF.gnnodeid))
            & (self.__gnmetaEdgesDF_cached.gnedgetype == 'GNDataNodeEdge')
        ]
        jDF = self.__gnmetaEdgesDF_cached.join(dnodeDF, cond, 'inner')
        jDF.show(4)
        e1DF = jDF.select("gnedgeid", "gnedgename", "gnedgetype",
                          "gnsrcnodeid", "gntgtnodeid")
        eDF = e1DF.dropDuplicates(['gnedgeid']).sort('gnedgeid')
        ecount = eDF.count()
        gnsrch_log('GnSrchOps: showing unique edges #nodes ' + str(ecount))
        eDF.show(5)
        mcols = [F.col("gnsrcnodeid"), F.col("gntgtnodeid")]
        res = eDF.withColumn("edgenodes", F.array(mcols)) \
                 .select("edgenodes")
        gnsrch_log('GnSrchOps: gnedges filter result 1 ')
        res.show(5)
        f1DF = res.select(F.explode(F.col("edgenodes")).alias("gnnodeid"))
        f1count = f1DF.count()
        gnsrch_log('GnSrchOps: Filter datanodes exploded #nodes ' + str(f1count))
        f1DF.show(10)
        gnsrch_log('GnSrchOps: Filtered datanodes and removed duplicates ')
        f2DF = f1DF.select("gnnodeid").distinct().sort("gnnodeid")
        f2count = f2DF.count()
        gnsrch_log('GnSrchOps: Filter nodes and distinct #nodes ' + str(f2count))
        f2DF.show(10)
        # Derived nodes: datanodes that never appear as an edge endpoint
        derivedNodeDF = dnodeDF.select("gnnodeid").join(
            f2DF, on=['gnnodeid'], how='left_anti').distinct().orderBy('gnnodeid')
        gnsrch_log('GnSrchOps: Enumerating derived datanodes ')
        nderivedNodes = derivedNodeDF.count()
        gnsrch_log('GnSrchOps: derived datanodes #of nodes ' + str(nderivedNodes))
        dnDF = None
        if nderivedNodes > 0:
            derivedNodeDF.show(10)
            derivedNodeList = derivedNodeDF.collect()
            derived_NodeList = [row['gnnodeid'] for row in derivedNodeList]
            # Now iterate over the list and fetch node info for each gnnode
            gnsrch_log('GnSrchOps: Node info for derived datanodes ')
            gnsrch_log(derived_NodeList)
            nodeid_list = "(" + ",".join(str(x) for x in derived_NodeList) + ")"
            gnsrch_log('GnSrchOps: Getting node info for list ' + nodeid_list)
            sqlstr = "SELECT * from gnmetanodes where gnnodeid in " + nodeid_list + " "
            gnsrch_log('GnGraphSearchOps: executing sql ' + sqlstr)
            dnDF = self.__spark.sql(sqlstr)
            dnCount = dnDF.count()
            gnsrch_log('GnSrchOps: Derived datanodes enumerated #nodes ' + str(dnCount))
    except Exception as err:
        gnsrch_log('GnSrchOps: Exception received ' + str(err))
        eDF = None
        dnDF = None
    gnsrch_log('GnSrchOps: Completed datanodes gnedges fetch ')
    return (eDF, dnDF)
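The derived-node step above is a plain left anti-join: keep the ids in dnodeDF that never show up as an edge endpoint. A minimal sketch of the same pattern on toy data (assuming an active SparkSession named spark):

nodes = spark.createDataFrame([(1,), (2,), (3,)], ["gnnodeid"])
endpoints = spark.createDataFrame([(1,), (2,)], ["gnnodeid"])
derived = nodes.join(endpoints, on=["gnnodeid"], how="left_anti")
derived.show()  # only gnnodeid == 3 survives

As a design note, joining the derived ids back against the metanodes DataFrame would avoid string-building the IN clause for spark.sql, at the cost of restructuring the metanode lookup.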
def test_smvGetColName(self): df = self.createDF("k:String; v:String;", "a,b;c,d;,") self.assertEqual(df.k.smvGetColName(), 'k') self.assertEqual(array(df.k, df.v).smvGetColName(), 'array(k,v)')
def data(self):
    from pyspark.sql.functions import array, explode, col, lit
    return self.spark.range(10).toDF('id') \
        .withColumn("vs", array([lit(i) for i in range(20, 30)])) \
        .withColumn("v", explode(col('vs'))).drop('vs')
def with_explode_column(df): import pyspark.sql.functions as F df2 = df.withColumn('values', F.array(F.lit(1), F.lit(2))) df2 = df2.withColumn('value', F.explode(df2.values)) return df2
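A quick usage note: because the array literal has two elements, explode doubles the row count, pairing each input row with the values 1 and 2 (sketch, assuming an active SparkSession named spark):

df = spark.range(3)
with_explode_column(df).show()
# +---+------+-----+
# | id|values|value|
# +---+------+-----+
# |  0|[1, 2]|    1|
# |  0|[1, 2]|    2|
# |  1|[1, 2]|    1|
# ...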