Example #1
def text_features(p_df):
    """
    Extracts features derived from the Quora question texts.
    :param p_df: A DataFrame.
    :return: A DataFrame.
    """
    diff_len = udf(lambda arr: arr[0] - arr[1], IntegerType())
    common_words = udf(lambda arr: len(set(arr[0]).intersection(set(arr[1]))), IntegerType())
    unique_chars = udf(lambda s: len(''.join(set(s.replace(' ', '')))), IntegerType())


    p_df = p_df.withColumn("len_q1", length("question1")).withColumn("len_q2", length("question2"))
    p_df = p_df.withColumn("diff_len", diff_len(array("len_q1", "len_q2")))
    p_df = p_df.withColumn("words_q1", size("question1_words")).withColumn("words_q2", size("question2_words"))
    p_df = p_df.withColumn("common_words", common_words(array("question1_words", "question2_words")))
    p_df = p_df.withColumn(
        "unique_chars_q1", unique_chars("question1")
    ).withColumn("unique_chars_q2", unique_chars("question2"))

    assembler = VectorAssembler(
        inputCols=["len_q1", "len_q2", "diff_len", "words_q1", "words_q2", "common_words", "unique_chars_q1", "unique_chars_q2"],
        outputCol="text_features"
    )
    p_df = assembler.transform(p_df)    
    return p_df
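# A minimal usage sketch (not from the original project): it assumes an active
# SparkSession named `spark`, plus the imports text_features() relies on
# (udf, array, length, size, IntegerType, VectorAssembler), and builds the
# tokenized *_words columns the function expects before calling it.
from pyspark.sql.functions import split

qa_pairs = spark.createDataFrame(
    [("What is Spark?", "What is PySpark?")],
    ["question1", "question2"])
qa_pairs = qa_pairs.withColumn("question1_words", split("question1", " ")) \
                   .withColumn("question2_words", split("question2", " "))
text_features(qa_pairs).select("text_features").show(truncate=False)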
 def data(self):
     from pyspark.sql.functions import array, explode, col, lit
     return self.spark.range(10).toDF('id') \
         .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \
         .withColumn("v", explode(col('vs'))) \
         .drop('vs') \
         .withColumn('w', lit(1.0))
 def test_vectorized_udf_basic(self):
     from pyspark.sql.functions import pandas_udf, col, array
     df = self.spark.range(10).select(
         col('id').cast('string').alias('str'),
         col('id').cast('int').alias('int'),
         col('id').alias('long'),
         col('id').cast('float').alias('float'),
         col('id').cast('double').alias('double'),
         col('id').cast('decimal').alias('decimal'),
         col('id').cast('boolean').alias('bool'),
         array(col('id')).alias('array_long'))
     f = lambda x: x
     str_f = pandas_udf(f, StringType())
     int_f = pandas_udf(f, IntegerType())
     long_f = pandas_udf(f, LongType())
     float_f = pandas_udf(f, FloatType())
     double_f = pandas_udf(f, DoubleType())
     decimal_f = pandas_udf(f, DecimalType())
     bool_f = pandas_udf(f, BooleanType())
     array_long_f = pandas_udf(f, ArrayType(LongType()))
     res = df.select(str_f(col('str')), int_f(col('int')),
                     long_f(col('long')), float_f(col('float')),
                     double_f(col('double')), decimal_f('decimal'),
                     bool_f(col('bool')), array_long_f('array_long'))
     self.assertEquals(df.collect(), res.collect())
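# A stand-alone sketch of the same idea outside the test harness (assumes a
# SparkSession `spark` and pyarrow installed; this is the 2.x-style
# pandas_udf(f, returnType) form used above): a scalar pandas_udf receives and
# returns a pandas.Series per batch, so an identity function round-trips an
# array column unchanged.
from pyspark.sql.functions import array, col, pandas_udf
from pyspark.sql.types import ArrayType, LongType

identity_arr = pandas_udf(lambda s: s, ArrayType(LongType()))
spark.range(3).select(identity_arr(array(col("id"))).alias("arr")).show()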
    def test_manual(self):
        df = self.data
        sum_udf = self.pandas_agg_sum_udf
        mean_udf = self.pandas_agg_mean_udf
        mean_arr_udf = pandas_udf(
            self.pandas_agg_mean_udf.func,
            ArrayType(self.pandas_agg_mean_udf.returnType),
            self.pandas_agg_mean_udf.evalType)

        result1 = df.groupby('id').agg(
            sum_udf(df.v),
            mean_udf(df.v),
            mean_arr_udf(array(df.v))).sort('id')
        expected1 = self.spark.createDataFrame(
            [[0, 245.0, 24.5, [24.5]],
             [1, 255.0, 25.5, [25.5]],
             [2, 265.0, 26.5, [26.5]],
             [3, 275.0, 27.5, [27.5]],
             [4, 285.0, 28.5, [28.5]],
             [5, 295.0, 29.5, [29.5]],
             [6, 305.0, 30.5, [30.5]],
             [7, 315.0, 31.5, [31.5]],
             [8, 325.0, 32.5, [32.5]],
             [9, 335.0, 33.5, [33.5]]],
            ['id', 'sum(v)', 'avg(v)', 'avg(array(v))'])

        self.assertPandasEqual(expected1.toPandas(), result1.toPandas())
Example #5
    def test_smvArrayFlatten(self):
        df = self.createDF('a:String;b:String;c:String', ',,;1,2,;2,3,4')
        df1 = df.select(F.array(
            F.array(F.lit(None), F.col('a')),
            F.array(F.col('a'), F.col('b'), F.col('c'))
        ).alias('aa'))

        res1 = df1.select(F.col('aa').smvArrayFlatten(StringType()).alias('a'))\
            .select(SF.smvArrayCat('|', F.col('a')).alias('k'))

        exp = self.createDF("k: String",
        """||||;
            |1|1|2|;
            |2|2|3|4""")

        res2 = df1.select(F.col('aa').smvArrayFlatten(df1).alias('a'))\
            .select(SF.smvArrayCat('|', F.col('a')).alias('k'))

        self.should_be_same(res1, exp)
        self.should_be_same(res2, exp)
    def test_array_type_correct(self):
        df = self.data.withColumn("arr", array(col("id"))).repartition(1, "id")

        output_schema = StructType(
            [StructField('id', LongType()),
             StructField('v', IntegerType()),
             StructField('arr', ArrayType(LongType()))])

        udf = pandas_udf(
            lambda pdf: pdf,
            output_schema,
            PandasUDFType.GROUPED_MAP
        )

        result = df.groupby('id').apply(udf).sort('id').toPandas()
        expected = df.toPandas().groupby('id').apply(udf.func).reset_index(drop=True)
        self.assertPandasEqual(expected, result)
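# A stand-alone sketch of the same GROUPED_MAP pattern outside the test harness
# (assumes a SparkSession `spark`; this is the legacy 2.x-style API, newer code
# would use groupby(...).applyInPandas): the UDF receives each group as a
# pandas DataFrame and must return a frame matching the declared schema,
# including the array column.
from pyspark.sql.functions import array, col, pandas_udf, PandasUDFType
from pyspark.sql.types import ArrayType, LongType, StructField, StructType

schema = StructType([StructField('id', LongType()),
                     StructField('arr', ArrayType(LongType()))])

@pandas_udf(schema, PandasUDFType.GROUPED_MAP)
def passthrough(pdf):
    return pdf

spark.range(4).withColumn('arr', array(col('id'))) \
    .groupby('id').apply(passthrough).show()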
Example #7
File: np1.py Project: dtorne/np-py
def solveCNF(path="", startLevel=10):

    # 20 Vars - uf20-01.cnf # 4726 sol. 7714 nodes
    # 50 Vars - uf50-05.cnf

    if path == "":
        path = path

    expandids = udf(expandLevel, ArrayType(LongType()))

    graphDF = sc.parallelize([])
    (numVar, numClauses, clauses) = parseNPFunction(path)

    initData = [i for i in range(0, 2**(startLevel))]
    initRDD = sc.parallelize(initData)
    initDF = initRDD.map(lambda x: Row(id=int(x), level=startLevel))

    graphDF = sqlContext.createDataFrame(initDF)

    for newvar in range(startLevel + 1, numVar + 1):
        walkerDF = graphDF.select("id").filter(
            col("level") == startLevel).withColumn("state", lit(3))

        for walklevel in range(startLevel, newvar):
            if newvar == startLevel + 1:
                reducedClause = minimClause(clauses,
                                            walklevel,
                                            newvar,
                                            onlyNewVariable=False)
            else:
                reducedClause = minimClause(clauses, walklevel, newvar)

            #Parse state with clauses
            walkerDF = walkerDF.rdd.map(lambda row: Row(
                id=row['id'],
                state=updateState(reducedClause, row['id'], row['state'],
                                  walklevel, newvar))).toDF()
            # Create hash = id mod 2^level in graphDF so that all branches of a non-solution can be deleted
            graphDF = graphDF.select("*").withColumn(
                "hashID", graphDF['id'] % (2**walklevel))
            # Join graphDF with the updated-status walkerDF and delete all non-solutions at this level for the new variable (marked by state=-1). Set state=0 (not used) for ids that do not exist in walkerDF
            hashGraph = graphDF.join(
                walkerDF,
                walkerDF["id"] == graphDF["hashID"], how='leftouter').select(
                    graphDF["id"], col("level"), col("hashID"),
                    col("state")).fillna(0).where(col("state") != -1)
            #Not leaf of tree/graph
            if walklevel + 1 != newvar:
                walkerDF = hashGraph.select(
                    "id", "state").where((hashGraph["state"] != 0)
                                         & (col("level") == (walklevel + 1)))
                graphDF = hashGraph.select("id", "level")
            #Leaf of tree/graph
            else:
                #Expand new level with the new Variable
                newLevelDF = walkerDF.withColumn(
                    "id",
                    explode(
                        array(expandids(col("id"), col("state"),
                                        lit(newvar))))).withColumn(
                                            "level", lit(newvar)).select(
                                                explode("id").alias("id"),
                                                "level")
                graphDF = graphDF.select("id",
                                         "level").union(newLevelDF).cache()

    return graphDF
Example #8
 def test_smvIsAnyIn(self):
     df = self.createDF("k:String; v:String;", "a,b;c,d;,").select(array(col("k"), col("v")).alias("arr"))
     res = df.select(col("arr").smvIsAnyIn("a", "z").alias("isFound"))
     expected = self.createDF("isFound:Boolean", "true;false;false")
     self.should_be_same(expected,res)
#   Note the distance to the nearest point in time leading or lagging you
#   Note the distance of that nearest point to its neighbor
#   If you are closer to your neighbor than it is to its closest, merge and create a new point with a new outage time


def timestamp_average(timestamps):
    seconds = 0
    for i in range(0, len(timestamps)):
        seconds += timestamps[i]

    return int(seconds / len(timestamps))
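# Quick sanity check for timestamp_average (plain Unix-epoch seconds, not tied
# to the outage data below): the integer mean of the three timestamps is the
# middle one.
assert timestamp_average([1546300800, 1546300860, 1546300920]) == 1546300860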


max_cluster_size = 500
pw_df = pw_df.select(
    array("core_id").alias("core_id"),
    array("tx").alias("tx"),
    array("feeder_id").alias("feeder_id"), "outage_time",
    array("restore_time").alias("restore_time"),
    array(F.struct("location_latitude",
                   "location_longitude")).alias("location"))

pw_df = pw_df.withColumn("outage_times", F.array("outage_time"))

#print("Starting with count:", pw_df.count())
pw_finalized_outages = spark.createDataFrame([], pw_df.schema)

# all of the local checkpoints should probably be switched to just checkpoints
# note the checkpointing is CRITICAL to the function of the algorithm in spark
# otherwise the RDD lineage is recalculated every loop and the plan creation time balloons exponentially
# checkpointing truncates the plan
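# A minimal sketch of the checkpointing pattern described above (names are
# illustrative, not the full clustering loop): truncating the lineage each
# iteration keeps the query plan from growing without bound.
merged_df = pw_df
for _ in range(3):
    # ... one round of merging nearby outages would go here ...
    merged_df = merged_df.localCheckpoint()  # or .checkpoint() after spark.sparkContext.setCheckpointDir(...)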
Example #10
def oversample_df(major_df_count, minor_df_count, major_df, minor_df):
    ratio = range(round(major_df_count / minor_df_count))
    oversampled_df = minor_df.withColumn("nv",\
                     explode(array([lit(x) for x in ratio]))).drop('nv')
    combined_df = major_df.unionAll(oversampled_df)
    return combined_df
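# A usage sketch with hypothetical class DataFrames (assumes a SparkSession
# `spark` and `from pyspark.sql.functions import array, explode, lit`, which
# the function above already needs): the minority class is replicated roughly
# major/minor times via explode() and unioned back onto the majority class.
major = spark.range(1000).withColumn("label", lit(0))
minor = spark.range(100).withColumn("label", lit(1))
balanced = oversample_df(major.count(), minor.count(), major, minor)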
for i in range(1, 5):
    cancer = 'Cancer_{}'.format(i)
    seq = 'Seq_{}'.format(i)
    df_04 = df_04.withColumn(
        cancer, F.concat_ws('_',
                            F.col('items').getItem(i - 1), F.col(seq)))
    if (i < 4):
        df_03 = df_03.withColumn(
            cancer, F.concat_ws('_',
                                F.col('items').getItem(i - 1), F.col(seq)))
        if (i < 3):
            df_02 = df_02.withColumn(
                cancer,
                F.concat_ws('_',
                            F.col('items').getItem(i - 1), F.col(seq)))

df_02 = df_02.withColumn('items', F.array('Cancer_1', 'Cancer_2'))
df_03 = df_03.withColumn('items', F.array('Cancer_1', 'Cancer_2', 'Cancer_3'))
df_04 = df_04.withColumn(
    'items', F.array('Cancer_1', 'Cancer_2', 'Cancer_3', 'Cancer_4'))
df_02 = df_02.select('Patient_ID', 'items', 'Sex', 'Ages')
df_03 = df_03.select('Patient_ID', 'items', 'Sex', 'Ages')
df_04 = df_04.select('Patient_ID', 'items', 'Sex', 'Ages')

data = df_02.union(df_03).union(df_04)
data = data_final
data = data.select('Patient_ID', 'items', 'Sex', 'Ages')

for i in range(4):
    j = i + 1
    level = 'Level_{}'.format(j)
    age = 'Age_{}'.format(j)
Example #12
dataset = dataset.withColumn("year", (F.col("year") - min_year) /
                             (max_year - min_year))

# Normalize the month, day, hour, minute and second columns

dataset = dataset.withColumn("month", (F.col("month") - 1) / (12 - 1))
dataset = dataset.withColumn("day", (F.col("day") - 1) / (31 - 1))
dataset = dataset.withColumn("hour", (F.col("hour") - 0) / (23 - 0))
dataset = dataset.withColumn("minute", (F.col("minute") - 0) / (59 - 0))
dataset = dataset.withColumn("second", (F.col("second") - 0) / (59 - 0))
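# The same min-max scaling, factored into a small helper for readability
# (a sketch, not part of the original pipeline; it relies on the same F alias
# used above): the bounds are passed in explicitly, exactly as in the
# expressions above.
def min_max_scale(df, column, lo, hi):
    return df.withColumn(column, (F.col(column) - lo) / (hi - lo))

# e.g. dataset = min_max_scale(dataset, "day", 1, 31)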

# Word2Vec

dataset = dataset.withColumn(
    'categorical',
    F.concat(F.array('rat'), F.array('mcc'), F.array('mnc'), F.array('msin'),
             F.array('tac'), F.array('snr')))

word2Vec_output_path = "{}/data/word2VecModel.bin".format(base_path)
word2Vec = Word2VecModel.load(word2Vec_output_path)
dataset = word2Vec.transform(dataset)

# VectorAssembler

sizeHint = VectorSizeHint(inputCol="vcategorical",
                          handleInvalid="skip",
                          size=50)
dataset = sizeHint.transform(dataset)

vector_assembler_output_path = "{}/data/vectorAssemblerW2VModel.bin".format(
    base_path)
Example #13
#documentDF = sqlContext.createDataFrame([
#    ("Hi I heard about Spark".split(" "), ),
#    ("I wish Java could use case classes".split(" "), ),
#    ("Logistic regression models are neat".split(" "), )
#], ["text"])

#print(documentDF.printSchema())
#print(df_raw8.printSchema())


# In[78]:


import pyspark.sql.functions as F
df_raw8 = df_raw8.withColumn("new_text", F.array(F.col("text")))


# In[79]:


df_raw8.show()


# In[80]:


from pyspark.ml.feature import Word2Vec
from pyspark.sql import SQLContext

sqlContext = SQLContext(spark)
#print (df2_user.dtypes)

df2_user_clean = df2_user.drop_duplicates(
    [' TimeSt', 'Country', 'Province', 'City', 'Latitude', 'Longitude'])

print('length of clean dataframe: %d' % df2_user_clean.count())

#a1 = np.array(df_poi['lat'])
#a2 = np.array(df_poi['long'])
#a3 = np.array(df_poi['poi_id'])
#array_poi = df_poi.select(array(' Latitude','Longitude','POIID')).collect()
array_poi = df_poi.sort("Longitude", ascending=True).collect()

#a4 = np.array(df2_user_clean['Latitude'])
#a5 = np.array(df2_user_clean['Longitude'])
array_userLoc = df2_user_clean.select(array('Latitude', 'Longitude')).collect()

# In[150]:

kdT = create_KD_Tree(array_poi)
print("\n\nk-d Tree: %s" % (kdT))

# In[151]:

pos_list = []
pos_dis = []

for point in array_userLoc:
    return_val = nn_kdtree2(point, kdT)
    pos_ans = return_val[0]
    dis_ans = haversine_dis(point, return_val)
Example #15
origin_desc = "origin_desc"
dest_desc = "dest_desc"

airports = airports.dropDuplicates(['code'])
carriers = carriers.dropDuplicates(['code'])

air_car = airlines.join(carriers, airlines.carrier == carriers.code).select(
    [a for a in airlines.columns] + [carriers.description.alias(carrier_desc)])

print("\n\n\n")
print(air_car.head(1))

combined = air_car.join(airports, air_car.origin == airports.code)\
  .select([a for a in air_car.columns] + [airports.description.alias(origin_desc)])\
  .join(airports, air_car.dest == airports.code)\
  .select([a for a in air_car.columns] + [origin_desc] + [airports.description.alias(dest_desc)])

print("\n\n\n")
print(combined.head(2))
print("\n\n\n")

combined = combined.withColumn(
    "origin_dest_names", array(origin_desc,
                               dest_desc)).drop(origin_desc).drop(dest_desc)
combined = combined.toDF(*[a.lower() for a in combined.columns])

combined.registerTempTable("temp_table")
hive_context.sql(
    "CREATE TABLE flight.flight_data_denorm STORED AS ORC AS SELECT * from temp_table"
)
Example #16
def test_score_logistic_model(spark):
    lvl1df = spark.read.parquet(
        f'{data_root}/bt_reduceded_1part.snappy.parquet')
    sample_blocks_df = spark.read.parquet(f'{data_root}/groupedIDs.snappy.parquet') \
        .withColumn('sample_block', f.col('sample_block').cast('string')) \
        .withColumn('sample_ids', f.expr('transform(sample_ids, v -> cast(v as string))'))
    sample_blocks = {
        r.sample_block: r.sample_ids
        for r in sample_blocks_df.collect()
    }
    with open(f'{data_root}/test_score_logistic_model.json') as json_file:
        test_values = json.load(json_file)

    map_key_pattern = ['sample_block', 'label', 'alpha_name']
    reduce_key_pattern = ['header_block', 'header', 'label', 'alpha_name']
    model_key_pattern = ['sample_block', 'label', 'alpha_name']
    score_key_pattern = ['sample_block', 'label']

    map_udf = f.pandas_udf(
        lambda key, pdf:
        map_irls_eqn(key, map_key_pattern, pdf, labeldf, sample_blocks, covdf,
                     beta_cov_dict, maskdf, alphas), irls_eqn_struct,
        PandasUDFType.GROUPED_MAP)

    reduce_udf = f.pandas_udf(
        lambda key, pdf: reduce_irls_eqn(key, reduce_key_pattern, pdf),
        irls_eqn_struct, PandasUDFType.GROUPED_MAP)

    model_udf = f.pandas_udf(
        lambda key, pdf: solve_irls_eqn(key, model_key_pattern, pdf, labeldf,
                                        alphas, covdf), model_struct,
        PandasUDFType.GROUPED_MAP)

    score_udf = f.pandas_udf(
        lambda key, pdf: score_models(key,
                                      score_key_pattern,
                                      pdf,
                                      labeldf,
                                      sample_blocks,
                                      alphas,
                                      covdf,
                                      maskdf,
                                      metric='log_loss'), cv_struct,
        PandasUDFType.GROUPED_MAP)

    modeldf = lvl1df \
        .withColumn('alpha_name', f.explode(f.array([f.lit(n) for n in alphas.keys()]))) \
        .groupBy(map_key_pattern) \
        .apply(map_udf) \
        .groupBy(reduce_key_pattern) \
        .apply(reduce_udf) \
        .groupBy(model_key_pattern) \
        .apply(model_udf) \
        .withColumn('alpha_label_coef', f.expr('struct(alphas[0] AS alpha, labels[0] AS label, coefficients[0] AS coefficient)')) \
        .groupBy('header_block', 'sample_block', 'header', 'sort_key', f.col('alpha_label_coef.label')) \
        .agg(f.sort_array(f.collect_list('alpha_label_coef')).alias('alphas_labels_coefs')) \
        .selectExpr('*', 'alphas_labels_coefs.alpha AS alphas', 'alphas_labels_coefs.label AS labels', 'alphas_labels_coefs.coefficient AS coefficients') \
        .drop('alphas_labels_coefs', 'label')

    cvdf = lvl1df.drop('header_block', 'sort_key') \
        .join(modeldf, ['header', 'sample_block'], 'right') \
        .withColumn('label', f.coalesce(f.col('label'), f.col('labels').getItem(0))) \
        .groupBy(score_key_pattern) \
        .apply(score_udf)

    outdf = cvdf.filter(
        f'sample_block = "{test_sample_block}" AND label = "{test_label}"'
    ).toPandas()
    scores_glow = outdf['score'].to_numpy()

    assert (np.allclose(np.array(test_values['scores']), scores_glow))
Example #17
df = (
    spark
    .read
    .option("flattenInfoFields", False)
    .format('vcf')
    .load(INPUT_VCF)
)

df = glow.transform("split_multiallelics", df)

df.printSchema()

df = df.withColumn("names", f.array([f.concat(
    f.col('contigName'),
    f.lit(":"),
    f.col('start') + 1,
    f.lit(":"),
    f.col('referenceAllele'),
    f.lit(">"),
    f.col('alternateAlleles')[0]
)]))

df.limit(10).toPandas()

# +
import json
import shlex

input_df = df.select([
    f.col('contigName'),
    f.col('start'),
    f.col('end'),
df_device = df.select([col for col in df.columns if not col.startswith("rule_")])

ruleCnt = sorted([int(col.split("_")[1]) for col in df.columns if col.split("_")[0] == "rule" and col.split("_")[2] in ["reason","score","type"]])[-1]

for i in range(ruleCnt+1):
    rule = "rule_"+str(i)
    df = df.withColumn("new" + rule, \
                        sf.concat(
                            sf.coalesce(sf.col(rule+"_reason"),sf.lit("$$$")), sf.lit("^"), \
                            sf.coalesce(sf.col(rule+"_score"),sf.lit("$$$")), sf.lit("^"),\
                            sf.coalesce(sf.col(rule+"_type"),sf.lit("$$$")) \
                            ) \
                            )

df = df.withColumn( "ruleArray", sf.array([col for col in df.columns if col.split("_")[0] == "newrule"]) )

dfexplode = df.select(sf.col("id"),sf.col("applicantId"),sf.col("loan_application_id"),sf.col("createdDatePT"),sf.col("year"),sf.col("month"),sf.col("day") \
            ,sf.explode_outer("ruleArray").alias("rule") \
            )

dfsplitCol = dfexplode.withColumn("rule_reason", sf.split("rule", r"\^")[0]) \
         .withColumn("rule_score", sf.split("rule", r"\^")[1]) \
         .withColumn("rule_type", sf.split("rule", r"\^")[2])

df_rules = dfsplitCol.select(sf.col("id"),sf.col("applicantId"),sf.col("loan_application_id"),sf.col("createdDatePT"),sf.col("year"),sf.col("month"),sf.col("day"), \
                sf.when(sf.col("rule_reason")=="$$$",None).otherwise(sf.col("rule_reason")).alias("rule_reason"), \
                sf.when(sf.col("rule_score")=="$$$",None).otherwise(sf.col("rule_score")).cast("integer").alias("rule_score"), \
                sf.when(sf.col("rule_type")=="$$$",None).otherwise(sf.col("rule_type")).alias("rule_type")) \
                .where(sf.col("rule_reason").isNotNull() | sf.col("rule_score").isNotNull() | sf.col("rule_type").isNotNull())
Example #19
def parse(path_to_dir):
    global TARGET_DIR
    TARGET_DIR = os.path.join(TARGET_DIR, os.path.split(path_to_dir)[1])

    if 'DAS5' in os.environ:  # If we want to execute it on the DAS-5 super computer
        print("We are on DAS5, {0} is master.".format(os.environ['HOSTNAME'] +
                                                      ".ib.cluster"))
        spark = SparkSession.builder \
            .master("spark://" + os.environ['HOSTNAME'] + ".ib.cluster:7077") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "28G") \
            .config("spark.executor.cores", "8") \
            .config("spark.executor.instances", "10") \
            .config("spark.driver.memory", "40G") \
            .config("spark.sql.execution.arrow.enabled", "true") \
            .getOrCreate()
    else:
        findspark.init(spark_home="<path to spark>")
        spark = SparkSession.builder \
            .master("local[8]") \
            .appName("WTA parser") \
            .config("spark.executor.memory", "20G") \
            .config("spark.driver.memory", "8G") \
            .getOrCreate()

    if not os.path.exists(os.path.join(TARGET_DIR, Task.output_path())):
        print("######\nStart parsing Tasks\n######")
        task_df = spark.read.format('com.databricks.spark.csv').options(
            header='true', inferschema='true').load(
                os.path.join(path_to_dir, '*.csv.processed'))

        # Drop the pref table, saving memory and filter out unsuccessful jobs as their information is not reliable
        task_df = task_df.drop('pref').filter(
            task_df.status == ":instance.status/success").drop(
                'status').cache()

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def sub_two_datetimes(s1, s2):
            arr = []
            for i in s1.keys():
                d1 = datetime.datetime.strptime(s1[i],
                                                '%a %b %d %H:%M:%S %Z %Y')
                d2 = datetime.datetime.strptime(s2[i],
                                                '%a %b %d %H:%M:%S %Z %Y')
                arr.append(int((d2 - d1).total_seconds() * 1000))

            return pd.Series(arr)

        task_df = task_df \
            .withColumn('wait_time', sub_two_datetimes(F.col('submit-time'), F.col('start-time'))) \
            .withColumn('runtime', sub_two_datetimes(F.col('start-time'), F.col('end-time')))

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def date_time_to_unix(series):
            arr = []
            epoch = datetime.datetime.utcfromtimestamp(0)
            for i in series.keys():
                arr.append(
                    np.int64((datetime.datetime.strptime(
                        series[i], '%a %b %d %H:%M:%S %Z %Y') -
                              epoch).total_seconds() * 1000))

            return pd.Series(arr)

        task_df = task_df.withColumn(
            'submit-time',
            date_time_to_unix(F.col('submit-time'))).withColumnRenamed(
                'submit-time',
                "ts_submit").drop('start-time').drop('end-time').cache()

        min_ts = task_df.agg({"ts_submit": "min"}).collect()[0][0]
        task_df = task_df.withColumn('ts_submit',
                                     F.col('ts_submit') - F.lit(min_ts))

        @F.pandas_udf(T.DoubleType(), F.PandasUDFType.SCALAR)
        def convert_to_kb(v):
            return v * 1024

        task_df = task_df.withColumn('memory', convert_to_kb(
            task_df.memory)).withColumnRenamed("memory", "memory_consumption")

        @F.pandas_udf(T.IntegerType(), F.PandasUDFType.SCALAR)
        def string_to_int(v):
            arr = []
            for i in v.keys():
                arr.append(mmh3.hash(v[i], signed=True))

            return pd.Series(arr)

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def string_to_long(v):
            arr = []
            for i in v.keys():
                arr.append(mmh3.hash64(v[i], signed=True)[0])

            return pd.Series(arr)

        @F.pandas_udf(T.LongType(), F.PandasUDFType.SCALAR)
        def assign_workflow_ids(v):
            arr = []
            for i in v.keys():
                if v[i]:
                    arr.append(mmh3.hash64(v[i], signed=True)[0])
                else:
                    arr.append(
                        mmh3.hash64(uuid4().bytes, signed=True)
                        [0])  # Assign a UUID, collision chance is negligible.

            return pd.Series(arr)

        task_df = task_df.withColumn('user', string_to_int(
            task_df.user)).withColumnRenamed("user", "user_id")
        task_df = task_df.withColumn('job-uuid',
                                     string_to_long(
                                         F.col('job-uuid'))).withColumnRenamed(
                                             'job-uuid', 'task_id')

        type_udf = F.udf(lambda x: "Independent" if x is None else "Composite",
                         T.StringType())
        task_df = task_df.withColumn('type', type_udf(task_df.simset))

        task_df = task_df.withColumn('simset',
                                     assign_workflow_ids(
                                         F.col('simset'))).withColumnRenamed(
                                             'simset', "workflow_id")
        task_df = task_df.withColumnRenamed('cpu', 'resource_amount_requested')

        task_df = task_df.withColumnRenamed('instance', 'resource_used')

        # Set the static items that are not present in the trace
        task_df = task_df.withColumn('submission_site', F.lit(0))
        task_df = task_df.withColumn('parents',
                                     F.array().cast(T.ArrayType(T.LongType())))
        task_df = task_df.withColumn('children',
                                     F.array().cast(T.ArrayType(T.LongType())))
        task_df = task_df.withColumn('group_id', F.lit(0))
        task_df = task_df.withColumn('nfrs', F.lit("{}"))
        task_df = task_df.withColumn('params', F.lit("{}"))
        task_df = task_df.withColumn('memory_requested', F.lit(-1))
        task_df = task_df.withColumn('network_io_time', F.lit(-1))
        task_df = task_df.withColumn('disk_io_time', F.lit(-1))
        task_df = task_df.withColumn('disk_space_requested', F.lit(-1))
        task_df = task_df.withColumn('energy_consumption', F.lit(-1))

        os.makedirs(os.path.join(TARGET_DIR, Task.output_path()),
                    exist_ok=True)
        task_df.write.parquet(os.path.join(TARGET_DIR, Task.output_path()),
                              mode="overwrite",
                              compression="snappy")
        print("######\nDone parsing Tasks\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, TaskState.output_path())):
        print("######\nStart parsing TaskState\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(
                os.path.join(TARGET_DIR, Task.output_path()))

        task_state_structtype = T.StructType([
            T.StructField("ts_start", T.LongType(), False),
            T.StructField("ts_end", T.LongType(), False),
            T.StructField("workflow_id", T.LongType(), False),
            T.StructField("task_id", T.LongType(), False),
            T.StructField("resource_id", T.LongType(), False),
            T.StructField("cpu_rate", T.DoubleType(), False),
            T.StructField("canonical_memory_usage", T.DoubleType(), False),
            T.StructField("assigned_memory", T.DoubleType(), False),
            T.StructField("minimum_memory_usage", T.DoubleType(), False),
            T.StructField("maximum_memory_usage", T.DoubleType(), False),
            T.StructField("disk_io_time", T.DoubleType(), False),
            T.StructField("maximum_disk_bandwidth", T.DoubleType(), False),
            T.StructField("local_disk_space_usage", T.DoubleType(), False),
            T.StructField("maximum_cpu_rate", T.DoubleType(), False),
            T.StructField("maximum_disk_io_time", T.DoubleType(), False),
            T.StructField("sample_rate", T.DoubleType(), False),
            T.StructField("sample_portion", T.DoubleType(), False),
            T.StructField("sampled_cpu_usage", T.DoubleType(), False),
            T.StructField("network_io_time", T.DoubleType(), False),
            T.StructField("maximum_network_bandwidth", T.DoubleType(), False),
        ])

        @F.pandas_udf(returnType=task_state_structtype,
                      functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_task_states(df):
            workflow_id = df['workflow_id'].iloc[0]
            task_id = df['task_id'].iloc[0]
            ts_start = df['ts_submit'].min()
            ts_end = ts_start + df['runtime'].max()
            resource_id = df['resource_used'].iloc[0]
            cpu_rate = -1
            canonical_memory_usage = df['memory_consumption'].mean()
            assigned_memory = -1
            minimum_memory_usage = df['memory_consumption'].min()
            maximum_memory_usage = df['memory_consumption'].max()
            disk_io_time = -1
            maximum_disk_bandwidth = -1
            local_disk_space_usage = -1
            maximum_cpu_rate = -1
            maximum_disk_io_time = -1
            sample_rate = -1
            sample_portion = -1
            sampled_cpu_usage = -1
            network_io_time = -1
            maximum_network_bandwidth = -1

            data_dict = {
                "ts_start": ts_start,
                "ts_end": ts_end,
                "workflow_id": workflow_id,
                "task_id": task_id,
                "resource_id": resource_id,
                "cpu_rate": cpu_rate,
                "canonical_memory_usage": canonical_memory_usage,
                "assigned_memory": assigned_memory,
                "minimum_memory_usage": minimum_memory_usage,
                "maximum_memory_usage": maximum_memory_usage,
                "disk_io_time": disk_io_time,
                "maximum_disk_bandwidth": maximum_disk_bandwidth,
                "local_disk_space_usage": local_disk_space_usage,
                "maximum_cpu_rate": maximum_cpu_rate,
                "maximum_disk_io_time": maximum_disk_io_time,
                "sample_rate": sample_rate,
                "sample_portion": sample_portion,
                "sampled_cpu_usage": sampled_cpu_usage,
                "network_io_time": network_io_time,
                "maximum_network_bandwidth": maximum_network_bandwidth,
            }

            return pd.DataFrame(data_dict, index=[0])

        task_state_df = task_df.groupBy(['workflow_id',
                                         'task_id']).apply(compute_task_states)
        os.makedirs(os.path.join(TARGET_DIR, TaskState.output_path()),
                    exist_ok=True)
        task_state_df.write.parquet(os.path.join(TARGET_DIR,
                                                 TaskState.output_path()),
                                    mode="overwrite",
                                    compression="snappy")
        print("######\nDone parsing TaskState\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, Resource.output_path())):
        print("######\nStart parsing Resources\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(
                os.path.join(TARGET_DIR, Task.output_path()))

        resource_id_column = [
            i.resource_used
            for i in task_df.select('resource_used').distinct().collect()
        ]

        resources = []
        for resource_id in resource_id_column:
            resources.append(
                Resource(resource_id, 'Cluster Node', 24, '', 256, -1, -1,
                         '').get_parquet_dict())

        resource_df = pd.DataFrame(resources)
        os.makedirs(os.path.join(TARGET_DIR, Resource.output_path()),
                    exist_ok=True)
        resource_df.to_parquet(os.path.join(TARGET_DIR, Resource.output_path(),
                                            'part.0.parquet'),
                               engine="pyarrow")
        print("######\nDone parsing Resources\n######")

    if not os.path.exists(os.path.join(TARGET_DIR, Workflow.output_path())):
        print("######\nStart parsing Workflows\n######")

        if 'task_df' not in locals():
            task_df = spark.read.parquet(
                os.path.join(TARGET_DIR, Task.output_path()))

        workflow_structype = T.StructType([
            T.StructField("id", T.LongType(), False),
            T.StructField("ts_submit", T.LongType(), False),
            T.StructField("task_count", T.IntegerType(), False),
            T.StructField("critical_path_length", T.LongType(), False),
            T.StructField("critical_path_task_count", T.IntegerType(), False),
            T.StructField("approx_max_concurrent_tasks", T.IntegerType(),
                          False),
            T.StructField("nfrs", T.StringType(), False),
            T.StructField("scheduler", T.StringType(), False),
            T.StructField("total_resources", T.DoubleType(), False),
            T.StructField("total_memory_usage", T.DoubleType(), False),
            T.StructField("total_network_usage", T.LongType(), False),
            T.StructField("total_disk_space_usage", T.LongType(), False),
            T.StructField("total_energy_consumption", T.LongType(), False),
        ])

        @F.pandas_udf(returnType=workflow_structype,
                      functionType=F.PandasUDFType.GROUPED_MAP)
        def compute_workflow_stats(df):
            id = df['workflow_id'].iloc[0]
            ts_submit = df['ts_submit'].min()
            task_count = len(df)
            critical_path_length = -1
            critical_path_task_count = -1
            approx_max_concurrent_tasks = -1
            nfrs = "{}"
            scheduler = "Cook"
            total_resources = df['resource_amount_requested'].sum()
            total_memory_usage = df['memory_consumption'].sum()
            total_network_usage = -1
            total_disk_space_usage = -1
            total_energy_consumption = -1

            data_dict = {
                "id": id,
                "ts_submit": ts_submit,
                'task_count': task_count,
                'critical_path_length': critical_path_length,
                'critical_path_task_count': critical_path_task_count,
                'approx_max_concurrent_tasks': approx_max_concurrent_tasks,
                'nfrs': nfrs,
                'scheduler': scheduler,
                'total_resources': total_resources,
                'total_memory_usage': total_memory_usage,
                'total_network_usage': total_network_usage,
                'total_disk_space_usage': total_disk_space_usage,
                'total_energy_consumption': total_energy_consumption
            }

            return pd.DataFrame(data_dict, index=[0])

        workflow_df = task_df.groupBy('workflow_id').apply(
            compute_workflow_stats)
        workflow_df.explain(True)
        workflow_df.write.parquet(os.path.join(TARGET_DIR,
                                               Workflow.output_path()),
                                  mode="overwrite",
                                  compression="snappy")
        print("######\nDone parsing Workflows\n######")

    print("######\nStart parsing Workload\n######")
    pandas_task_df = pd.read_parquet(os.path.join(TARGET_DIR,
                                                  Task.output_path()),
                                     engine="pyarrow")
    json_dict = Workload.get_json_dict_from_pandas_task_dataframe(
        pandas_task_df,
        domain="Industrial",
        start_date=None,
        end_date=None,
        authors=["Two Sigma"])

    os.makedirs(os.path.join(TARGET_DIR, Workload.output_path()),
                exist_ok=True)

    with open(
            os.path.join(TARGET_DIR, Workload.output_path(),
                         "generic_information.json"), "w") as file:
        # Need this on 32-bit python.
        def default(o):
            if isinstance(o, np.int64):
                return int(o)

        file.write(json.dumps(json_dict, default=default))
Example #20
###########################################################################################

df = spark.read.format("mongo").option('database', 'jamendo').option('collection', 'chords').load()
sample_300_df=df.select(["_id","chordRatio"]).limit(songs)

myFunc = f.udf(lambda array_to_list: [int(0) if e is None else int(1) for e in array_to_list], T.ArrayType(T.IntegerType()))
sample_300_df2=sample_300_df.withColumn('chordRatioMinHash', myFunc('chordRatio'))  
df0=sample_300_df2.select(["_id", "chordRatio", "chordRatioMinHash"])

from pyspark.sql import Row
from pyspark.sql.functions import col
from sparkaid import flatten
df0_flat=flatten(df0)
columns_list1=df0_flat.columns[1:-1]
array_df=df0_flat.select('_id', 'chordRatioMinHash',array(columns_list1).alias('chordRatioJS'))

#fill NaNs with zeros in the array column
df2_flat=df0_flat.na.fill(float(0))
columns_list2=df2_flat.columns[1:-1]
array_df2=df2_flat.select('_id', 'chordRatioMinHash',array(columns_list2).alias('chordRatioJS_no_Nulls'))

###
to_vector = udf(lambda a: Vectors.dense(a), VectorUDT())
data = array_df2.select('_id', 'chordRatioMinHash', "chordRatioJS_no_Nulls", to_vector("chordRatioJS_no_Nulls").alias("chordRatioWJS"))
data.show(1, truncate=False)
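# Alternative sketch (only if Spark >= 3.1 is available, not in the original
# script): pyspark.ml.functions.array_to_vector can replace the hand-rolled
# to_vector udf above.
from pyspark.ml.functions import array_to_vector
data_alt = array_df2.select(
    '_id', 'chordRatioMinHash',
    array_to_vector('chordRatioJS_no_Nulls').alias('chordRatioWJS'))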

import scipy.sparse
from pyspark.ml.linalg import Vectors, _convert_to_vector, VectorUDT
from pyspark.sql.functions import udf, col
## from dense to sparse array
# %%
#initialize chains w/ context, response, sender, author_id, next
chains = data.filter((data.in_response_to_tweet_id=="none") & ~(data.response_tweet_id=="none"))\
    .select(
        functions.col('tweet_id').alias('response'),
        functions.col('response_tweet_id').alias('next'),
        'text',
        'author_id',
    )

chains = chains.withColumn('context',
                           functions.lit("").cast(types.StringType()))
chains = chains.withColumn('sender',
                           functions.lit("").cast(types.StringType()))
chains = chains.withColumn('rpos', functions.lit(1).cast(types.IntegerType()))
chains = chains.withColumn('tweets', functions.array().cast("array<string>"))

chains.persist()

#initialize empty samples DF w/ context, sender, response, author_id
fields = ['sender', 'context', 'response', 'author_id']
samples_schema = types.StructType([
    types.StructField(field_name, types.StringType(), True)
    for field_name in fields
])

samples = spark.createDataFrame([], samples_schema)
#%%
MAX_DEPTH = 20
depth = 0
Example #22
from pyspark.sql.functions import array, col


def columns_to_array_column(N):
    '''Collect the N MinHash columns "_1" .. "_N" into a single array column.'''
    return array([col("_" + str(i)) for i in range(1, N + 1)])
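# Usage sketch (assumes a SparkSession `spark` and a DataFrame whose MinHash
# columns are literally named "_1" .. "_3"):
df_mh = spark.createDataFrame([(1, 2, 3)], ["_1", "_2", "_3"])
df_mh.select(columns_to_array_column(3).alias("minhash")).show()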
Example #23
def map_annotations_cols(dataframe: DataFrame,
                         f,
                         columns: list,
                         output_column: str,
                         annotatyon_type: str,
                         output_type: DataType = Annotation.arrayType()):
    """Creates a Spark UDF to map over multiple columns of Annotation results.

    Parameters
    ----------
    dataframe : DataFrame
        Input DataFrame
    f : function
        Function to apply to the column
    columns : list
        Names of the input columns
    output_column : str
        Name of the output column
    annotatyon_type : str
        Annotator type
    output_type : DataType, optional
        Output type, by default Annotation.arrayType()

    Returns
    -------
    :class:`pyspark.sql.DataFrame`
        Transformed DataFrame

    Examples
    --------
    >>> from sparknlp.pretrained import PretrainedPipeline
    >>> from sparknlp.functions import *
    >>> explain_document_pipeline = PretrainedPipeline("explain_document_dl")
    >>> data = spark.createDataFrame([["U.N. official Ekeus heads for Baghdad."]]).toDF("text")
    >>> result = explain_document_pipeline.transform(data)
    >>> chunks_df = map_annotations_cols(
    ...     result,
    ...     lambda x: [
    ...         Annotation("tag", a.begin, a.end, a.result, a.metadata, a.embeddings)
    ...         for a in x
    ...     ],
    ...     ["pos", "ner"],
    ...     "tags",
    ...     "chunk"
    ... )
    >>> chunks_df.selectExpr("explode(tags)").show(truncate=False)
    +-------------------------------------------+
    |col                                        |
    +-------------------------------------------+
    |[tag, 0, 2, NNP, [word -> U.N], []]        |
    |[tag, 3, 3, ., [word -> .], []]            |
    |[tag, 5, 12, JJ, [word -> official], []]   |
    |[tag, 14, 18, NNP, [word -> Ekeus], []]    |
    |[tag, 20, 24, VBZ, [word -> heads], []]    |
    |[tag, 26, 28, IN, [word -> for], []]       |
    |[tag, 30, 36, NNP, [word -> Baghdad], []]  |
    |[tag, 37, 37, ., [word -> .], []]          |
    |[tag, 0, 2, B-ORG, [word -> U.N], []]      |
    |[tag, 3, 3, O, [word -> .], []]            |
    |[tag, 5, 12, O, [word -> official], []]    |
    |[tag, 14, 18, B-PER, [word -> Ekeus], []]  |
    |[tag, 20, 24, O, [word -> heads], []]      |
    |[tag, 26, 28, O, [word -> for], []]        |
    |[tag, 30, 36, B-LOC, [word -> Baghdad], []]|
    |[tag, 37, 37, O, [word -> .], []]          |
    +-------------------------------------------+
    """
    return dataframe.withColumn(
        output_column,
        map_annotations_array(f, output_type)(array(*columns)).alias(
            output_column, metadata={'annotatorType': annotatyon_type}))
Example #24
@udf("double")
def PRODUCE_agg_acc_distance_percent(xs):
    if xs:
        temp = xs[-1]
        return temp
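# A quick illustration of the UDF above on made-up data (the column name is
# hypothetical): it returns the last element of an array column, or NULL when
# the array is empty/NULL thanks to the `if xs` guard.
df_demo = spark.createDataFrame([([0.1, 0.7],), ([0.3],)], ["xs"])
df_demo.select(PRODUCE_agg_acc_distance_percent("xs").alias("last_pct")).show()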


try:
    df_serve = df_serve.withColumn(
        "ts_acc_distance_percent",
        PRODUCE_ts_acc_distance_percent("ts_agg_acc_meters", "sd_len_route"))
except:
    a = [0.0, 0.0]
    df_serve = df_serve.withColumn("ts_acc_distance_percent",
                                   F.array([F.lit(x) for x in a]))
try:
    df_serve = df_serve.withColumn(
        "agg_acc_distance_percent",
        PRODUCE_agg_acc_distance_percent("ts_acc_distance_percent"))
except:
    df_serve = df_serve.withColumn("agg_acc_distance_percent", lit(0.0))

#---------------------------------------------------------------#
'''
Inference section
'''

#Clustering

save_path = "."
Example #25
def main(spark, measure):
    from extract import get_sample
    from mapping import get_dx_codes, get_comorbidity_cols
    from point_ranges import range_map

    # Make sure measure name is uppercase to match mappings
    measure_name = measure.upper()

    # Get mappings for diagnosis and comorbidity columns
    codes = get_dx_codes(spark)
    comorb_map = get_comorbidity_cols(spark)

    # Get data sample, join to dx codes, and filter to measure
    data = get_sample(spark)
    data = data.join(codes, 'diagnosis_code')
    data = data.filter(data.measure_name == measure_name)

    if data.count() > 0:
        # Convert comorbidity columns to ints and sum them
        com_cols = comorb_map.get(measure_name, [])
        for col in com_cols:
            data = data.withColumn(col, (functions.upper(
                data[col]) == functions.lit('YES')).cast(IntegerType()))
        data = data.withColumn('ComorbidityScore',
                               sum(data[x] for x in com_cols))

        # Reduce dataframe to relevant columns
        score_cols = [
            'LengthofStay', 'ED_visits', 'ComorbidityScore', 'Inpatient_visits'
        ]
        data = data.select(['encounter_id', 'patient_nbr'] + score_cols)

        # Assign point values for each of the score columns
        for col in score_cols:
            pts_col = col + '_pts'

            # Get ranges and point vals from range config
            attr_range = range_map.get(col)
            splits = [x[1] for x in attr_range] + [float("inf")]
            pts = functions.array([functions.lit(x[0]) for x in attr_range])

            # Transform data with bucketizer
            buckets = Bucketizer(splits=splits,
                                 inputCol=col,
                                 outputCol=pts_col)
            data = buckets.transform(data)

            # Turn bucket numbers into point values
            data = data.withColumn(
                pts_col, pts.getItem(data[pts_col].cast(IntegerType())))

        # Get LACE score for each row
        data = data.withColumn('LACEScore',
                               sum(data[x + '_pts'] for x in score_cols))

        # Calculate ratio score
        num = data.filter(data.LACEScore > 9).count()
        denom = data.count()
        return num / float(denom)
    else:
        return None
Example #26
def cal_performance(date, period, input_batch, output_batch):
    # Get the historical trading dates
    hist_dt = fetch_com_dt_hist(date)
    pef_horizions = {
        '1w': hist_dt.loc['B1W'].strftime('%Y%m%d'),
        '1m': hist_dt.loc['B1M'].strftime('%Y%m%d'),
        '3m': hist_dt.loc['B3M'].strftime('%Y%m%d'),
        '6m': hist_dt.loc['B6M'].strftime('%Y%m%d'),
        '1y': hist_dt.loc['B1Y'].strftime('%Y%m%d'),
        '3y': hist_dt.loc['B3Y'].strftime('%Y%m%d'),
        '5y': hist_dt.loc['B5Y'].strftime('%Y%m%d')
    }
    if period == 'all':
        start = None
    else:
        start = pef_horizions[period]
    ss = SparkSession \
        .builder \
        .appName(app_name + '_' + str(date) + '_' + period + '_' + str(is_debug)) \
        .getOrCreate()
    ss.sparkContext.setLogLevel('WARN')
    # Read the data from csv and convert the column types
    schema = StructType([
        StructField('date', TimestampType(), True),
        StructField('sec_id', StringType(), True),
        StructField('nav', FloatType(), True),
        StructField('ret', FloatType(), True),
        StructField('stock', FloatType(), True),
        StructField('treasury', FloatType(), True),
        StructField('credit', FloatType(), True),
        StructField('bench_ret', FloatType(), True),
        StructField('fnd_category', IntegerType(), True),
    ])
    # ret_all_spark_df = ss.read.csv(data_source_csv_path + date + '/' + str(input_batch) + '/ret_all.csv', header=True,
    #                                schema=schema)
    ret_all_spark_df = ss.read.csv(data_source_csv_path +
                                   '20200320/1/ret_all.csv',
                                   header=True,
                                   schema=schema)
    # In debug mode, only take a subset of funds
    if is_debug:
        logging.info('use debug')
        # sec_id_list = ['000006JK', '000028JK', '000134JK', '000135JK']
        # sec_id_list = ['005503JK', '005368JK', '004892JK', '150066JK',
        #         '000189JK', '000270JK', '000327JK']
        # sec_id_list = ['150066JK']
        # sec_id_list = ['006382JK']
        today_spark_df = ret_all_spark_df.filter(
            ret_all_spark_df.date == datetime.strptime(date, '%Y%m%d'))
        rank_w = Window.orderBy('sec_id')
        today_spark_df = today_spark_df.withColumn(
            'row_no',
            func.row_number().over(rank_w))
        today_spark_df = today_spark_df.filter(
            today_spark_df.row_no <= 100).select('sec_id')
        ret_all_spark_df = ret_all_spark_df.join(today_spark_df,
                                                 on='sec_id',
                                                 how='inner')
        # ret_all_spark_df = ret_all_spark_df[ret_all_spark_df.sec_id.isin(sec_id_list)]
    else:
        logging.info('use release')
    # Keep only data on or before date (date/start are %Y%m%d strings; note the conversion to timestamp)
    ret_all_spark_df = ret_all_spark_df[
        ret_all_spark_df.date <= datetime.strptime(date, '%Y%m%d')]
    # Keep funds whose last available date is not earlier than date
    w = Window.partitionBy('sec_id').orderBy('date').rowsBetween(
        Window.unboundedPreceding, Window.unboundedFollowing)
    ret_all_spark_df = ret_all_spark_df.withColumn('the_last_date',
                                                   func.last('date').over(w))
    ret_all_spark_df = ret_all_spark_df.where(
        ret_all_spark_df.the_last_date >= datetime.strptime(date, '%Y%m%d'))
    if period == 'all':
        # User-defined functions
        udf_mean = func.udf(lambda x: float(pd.Series(x).mean()), FloatType())
        udf_std = func.udf(lambda x: float(pd.Series(x).std()), FloatType())
        udf_min = func.udf(lambda x: float(pd.Series(x).min()), FloatType())
        udf_max = func.udf(lambda x: float(pd.Series(x).max()), FloatType())
        udf_p25 = func.udf(lambda x: float(pd.Series(x).quantile(0.25)),
                           FloatType())
        udf_median = func.udf(lambda x: float(pd.Series(x).median()),
                              FloatType())
        udf_p75 = func.udf(lambda x: float(pd.Series(x).quantile(0.75)),
                           FloatType())
        udf_skew = func.udf(lambda x: float(pd.Series(x).skew()), FloatType())
        udf_kurt = func.udf(lambda x: float(pd.Series(x).kurt()), FloatType())
        udf_start = func.udf(lambda x: str(x[0].strftime('%Y%m%d')),
                             StringType())
        udf_end = func.udf(lambda x: str(x[-1].strftime('%Y%m%d')),
                           StringType())
        udf_cagr = func.udf(lambda x: float(Measure.cal_cagr(pd.Series(x))),
                            FloatType())
        udf_cumret = func.udf(
            lambda x: float(Measure.cal_cumret(pd.Series(x))), FloatType())
        udf_standard_deviation = func.udf(
            lambda x: float(Measure.cal_standard_deviation(pd.Series(x))),
            FloatType())
        udf_max_drawdown = func.udf(
            lambda x, y: float(Measure.cal_max_drawdown(pd.Series(x, index=y))
                               ), FloatType())
        udf_sharpe = func.udf(
            lambda x: float(Measure.cal_sharpe(pd.Series(x))), FloatType())
        udf_downside_deviation = func.udf(
            lambda x: float(Measure.cal_downside_deviation(pd.Series(x))),
            FloatType())
        udf_alpha = func.udf(
            lambda x, y, z, w, f: float(
                Measure.cal_alpha(
                    pd.Series(x),
                    pd.DataFrame({
                        'stock': y,
                        'treasury': z,
                        'credit': w
                    }), f)), FloatType())
        udf_marketbeta = func.udf(
            lambda x, y: float(
                Measure.cal_marketbeta(pd.Series(x), pd.Series(y))),
            FloatType())
        udf_information = func.udf(
            lambda x, y: float(
                Measure.cal_information(pd.Series(x), pd.Series(y))),
            FloatType())
        udf_treynor = func.udf(
            lambda x, y: float(Measure.cal_treynor(pd.Series(x), pd.Series(y))
                               ), FloatType())
        # Filter out funds whose data history is too short
        ret_all_spark_df = ret_all_spark_df.withColumn(
            'fund_length',
            func.count('date').over(w))
        ret_all_spark_df = ret_all_spark_df[
            ret_all_spark_df['fund_length'] >= 2]
        nt_val_spark_df = ret_all_spark_df[
            ret_all_spark_df.date == datetime.strptime(date, '%Y%m%d')].select(
                'sec_id', 'nav').withColumnRenamed('nav', 'nt_val')
        # Sort to ensure ret is ordered by date; otherwise the values are not in date order after collect_list
        ret_all_spark_df = ret_all_spark_df.withColumn('ret_list',func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('ret').over(w)))\
            .withColumn('stock_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('stock').over(w)))\
            .withColumn('treasury_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('treasury').over(w)))\
            .withColumn('credit_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('credit').over(w)))\
            .withColumn('date_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(datetime.strptime('2020-03-06','%Y-%m-%d')))).otherwise(func.collect_list('date').over(w)))
        nav_agg_part_1 = ret_all_spark_df[
            ret_all_spark_df.date == ret_all_spark_df.the_last_date].select(
                'sec_id', 'ret_list', 'stock_ret_list', 'treasury_ret_list',
                'credit_ret_list', 'date_list', 'fnd_category')
        if is_debug:
            nav_agg_part_1.show()
        # ret_all_spark_df is not needed afterwards, so drop all of its columns
        ret_all_spark_df = ret_all_spark_df.drop(
            'sec_id', 'date', 'nav', 'ret', 'stock', 'treasury', 'credit',
            'bench_ret', 'fnd_category', 'the_last_date', 'fund_length',
            'ret_list', 'stock_ret_list', 'treasury_ret_list',
            'credit_ret_list', 'date_list')
        if is_debug:
            ret_all_spark_df.show()
        # Take the NAV as of the current date
        nav_agg_part_1 = nav_agg_part_1.join(nt_val_spark_df,
                                             on=['sec_id'],
                                             how='left')
        nav_agg_part_1 = nav_agg_part_1.withColumn('ret_mean', udf_mean('ret_list')) \
            .withColumn('ret_std', udf_std('ret_list')) \
            .withColumn('ret_min', udf_min('ret_list')) \
            .withColumn('ret_max', udf_max('ret_list')) \
            .withColumn('ret_p25', udf_p25('ret_list')) \
            .withColumn('ret_median', udf_median('ret_list')) \
            .withColumn('ret_p75', udf_p75('ret_list')) \
            .withColumn('ret_skew', udf_skew('ret_list')) \
            .withColumn('ret_kurtosis', udf_kurt('ret_list')) \
            .withColumn('ret_start', udf_start('date_list')) \
            .withColumn('cagr_sf', udf_cagr('ret_list'))\
            .withColumn('cumret_sf', udf_cumret('ret_list'))\
            .withColumn('vol_sf', udf_standard_deviation('ret_list'))\
            .withColumn('md_sf', udf_max_drawdown('ret_list','date_list'))\
            .withColumn('sharpe_sf', udf_sharpe('ret_list'))\
            .withColumn('dvol_sf', udf_downside_deviation('ret_list'))\
            .withColumn('alpha_sf', udf_alpha('ret_list','stock_ret_list','treasury_ret_list','credit_ret_list','fnd_category'))\
            .withColumn('beta_sf', udf_marketbeta('ret_list','stock_ret_list'))\
            .withColumn('ir_sf', udf_information('ret_list','stock_ret_list'))\
            .withColumn('treynor_sf', udf_treynor('ret_list','stock_ret_list'))
        # Drop the intermediate columns
        nav_agg_part_1 = nav_agg_part_1.drop('ret_list', 'stock_ret_list',
                                             'treasury_ret_list',
                                             'credit_ret_list', 'date_list',
                                             'fnd_category')
        if is_debug:
            nav_agg_part_1.show()
        if is_write_file:
            nav_agg_part_1.write.option(
                'header',
                'true').mode('overwrite').csv(output_csv_path + str(date) +
                                              "/" + str(output_batch) + "/" +
                                              period)
    else:
        # User-defined functions
        udf_cagr = func.udf(lambda x: float(Measure.cal_cagr(pd.Series(x))),
                            FloatType())
        udf_cumret = func.udf(
            lambda x: float(Measure.cal_cumret(pd.Series(x))), FloatType())
        udf_aar = func.udf(lambda x: float(Measure.cal_aar(pd.Series(x))),
                           FloatType())
        udf_alpha = func.udf(
            lambda x, y, z, w, f: float(
                Measure.cal_alpha(
                    pd.Series(x),
                    pd.DataFrame({
                        'stock': y,
                        'treasury': z,
                        'credit': w
                    }), f)), FloatType())
        udf_standard_deviation = func.udf(
            lambda x: float(Measure.cal_standard_deviation(pd.Series(x))),
            FloatType())
        udf_downside_deviation = func.udf(
            lambda x: float(Measure.cal_downside_deviation(pd.Series(x))),
            FloatType())
        udf_max_drawdown = func.udf(
            lambda x, y: float(Measure.cal_max_drawdown(pd.Series(x, index=y))
                               ), FloatType())
        udf_marketbeta = func.udf(
            lambda x, y: float(
                Measure.cal_marketbeta(pd.Series(x), pd.Series(y))),
            FloatType())
        udf_var = func.udf(lambda x: float(Measure.cal_var(pd.Series(x))),
                           FloatType())
        udf_sharpe = func.udf(
            lambda x: float(Measure.cal_sharpe(pd.Series(x))), FloatType())
        udf_sortino = func.udf(
            lambda x: float(Measure.cal_sortino(pd.Series(x))), FloatType())
        udf_calmar = func.udf(
            lambda x: float(Measure.cal_calmar(pd.Series(x))), FloatType())
        udf_omega = func.udf(lambda x: float(Measure.cal_omega(pd.Series(x))),
                             FloatType())
        udf_information = func.udf(
            lambda x, y: float(
                Measure.cal_information(pd.Series(x), pd.Series(y))),
            FloatType())
        udf_treynor = func.udf(
            lambda x, y: float(Measure.cal_treynor(pd.Series(x), pd.Series(y))
                               ), FloatType())
        udf_m_square = func.udf(
            lambda x, y: float(Measure.cal_m_square(pd.Series(x), pd.Series(y))
                               ), FloatType())
        udf_sterling = func.udf(
            lambda x: float(Measure.cal_sterling(pd.Series(x))), FloatType())
        udf_burke = func.udf(lambda x: float(Measure.cal_burke(pd.Series(x))),
                             FloatType())
        udf_tail = func.udf(lambda x: float(Measure.cal_tail(pd.Series(x))),
                            FloatType())
        udf_rachev = func.udf(
            lambda x: float(Measure.cal_rachev(pd.Series(x))), FloatType())
        udf_stability = func.udf(
            lambda x: float(Measure.cal_stability(pd.Series(x))), FloatType())
        udf_min_monthly_return = func.udf(
            lambda x, y: float(
                Measure.cal_min_monthly_return(pd.Series(x, index=y))),
            FloatType())
        udf_max_monthly_return = func.udf(
            lambda x, y: float(
                Measure.cal_max_monthly_return(pd.Series(x, index=y))),
            FloatType())
        udf_monthly_odds = func.udf(
            lambda x, y: float(Measure.cal_monthly_odds(pd.Series(x, index=y))
                               ), FloatType())
        udf_picking = func.udf(
            lambda x, y: float(
                Measure.cal_picking(pd.Series(x), pd.Series(y, name='stock'))),
            FloatType())
        udf_timing = func.udf(
            lambda x, y: float(
                Measure.cal_timing(pd.Series(x), pd.Series(y, name='stock'))),
            FloatType())
        udf_trackerror = func.udf(
            lambda x, y, z: float(
                Measure.cal_trackerror(pd.Series(x), pd.Series(y), z)),
            FloatType())
        # Keep only funds whose first available date is on or before the start date
        ret_all_spark_df = ret_all_spark_df.withColumn(
            'the_first_date',
            func.first('date').over(w))
        ret_all_spark_df = ret_all_spark_df[ret_all_spark_df.the_first_date <=
                                            datetime.strptime(start, '%Y%m%d')]
        # Slice: keep data on or after the start date
        ret_all_spark_df = ret_all_spark_df[
            ret_all_spark_df.date >= datetime.strptime(start, '%Y%m%d')]
        # Filter out funds with too little data; the slice changes the length, so recount it
        ret_all_spark_df = ret_all_spark_df.withColumn(
            'fund_length',
            func.count('date').over(w))
        ret_all_spark_df = ret_all_spark_df[
            ret_all_spark_df['fund_length'] >= 2]
        # Sort so ret is ordered by date; otherwise the values are not in date order after collect_list
        ret_all_spark_df = ret_all_spark_df.withColumn('ret_list',func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('ret').over(w)))\
            .withColumn('stock_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('stock').over(w)))\
            .withColumn('treasury_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('treasury').over(w)))\
            .withColumn('credit_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('credit').over(w))) \
            .withColumn('bench_ret_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(0))).otherwise(func.collect_list('bench_ret').over(w))) \
            .withColumn('date_list', func.when(func.col('date') != func.col('the_last_date'), func.array(func.lit(datetime.strptime('2020-03-06','%Y-%m-%d')))).otherwise(func.collect_list('date').over(w)))
        nav_agg_part_2 = ret_all_spark_df[
            ret_all_spark_df.date == ret_all_spark_df.the_last_date].select(
                'sec_id', 'ret_list', 'stock_ret_list', 'treasury_ret_list',
                'credit_ret_list', 'bench_ret_list', 'date_list',
                'fnd_category')
        if is_debug:
            nav_agg_part_2.show()
        # ret_all_spark_df is not needed after this; drop all of its columns
        ret_all_spark_df = ret_all_spark_df.drop(
            'sec_id', 'date', 'nav', 'ret', 'stock', 'treasury', 'credit',
            'bench_ret', 'fnd_category', 'the_last_date', 'the_first_date',
            'fund_length', 'ret_list', 'stock_ret_list', 'treasury_ret_list',
            'credit_ret_list', 'bench_ret_list', 'date_list')
        if is_debug:
            ret_all_spark_df.show()
        nav_agg_part_2 = nav_agg_part_2.withColumn('cagr_' + period, udf_cagr('ret_list'))\
            .withColumn('cumret_' + period, udf_cumret('ret_list'))\
            .withColumn('aar_' + period, udf_aar('ret_list'))\
            .withColumn('alpha_' + period, udf_alpha('ret_list','stock_ret_list','treasury_ret_list','credit_ret_list', 'fnd_category'))\
            .withColumn('vol_' + period, udf_standard_deviation('ret_list'))\
            .withColumn('dvol_' + period, udf_downside_deviation('ret_list'))\
            .withColumn('md_' + period, udf_max_drawdown('ret_list', 'date_list'))\
            .withColumn('beta_' + period, udf_marketbeta('ret_list', 'stock_ret_list'))\
            .withColumn('var_' + period, udf_var('ret_list'))\
            .withColumn('sharpe_' + period, udf_sharpe('ret_list'))\
            .withColumn('sortino_' + period, udf_sortino('ret_list'))\
            .withColumn('calmar_' + period, udf_calmar('ret_list'))\
            .withColumn('omega_' + period, udf_omega('ret_list'))\
            .withColumn('ir_' + period, udf_information('ret_list','stock_ret_list'))\
            .withColumn('treynor_' + period, udf_treynor('ret_list','stock_ret_list'))\
            .withColumn('m_square_' + period, udf_m_square('ret_list','stock_ret_list'))\
            .withColumn('sterling_' + period, udf_sterling('ret_list'))\
            .withColumn('burke_' + period, udf_burke('ret_list'))\
            .withColumn('tail_' + period, udf_tail('ret_list'))\
            .withColumn('rachev_' + period, udf_rachev('ret_list'))\
            .withColumn('stability_' + period, udf_stability('ret_list'))
        if period in ['3m', '6m', '1y', '3y', '5y']:
            nav_agg_part_2 = nav_agg_part_2.withColumn('min_monthly_ret_' + period, udf_min_monthly_return('ret_list','date_list'))\
                .withColumn('max_monthly_ret_' + period, udf_max_monthly_return('ret_list','date_list'))\
                .withColumn('monthly_odds_' + period, udf_monthly_odds('ret_list', 'date_list'))
        if period in ['1m', '3m', '6m', '1y', '3y', '5y']:
            nav_agg_part_2 = nav_agg_part_2.withColumn('picking_' + period, udf_picking('ret_list', 'stock_ret_list'))\
                .withColumn('timing_' + period, udf_timing('ret_list', 'stock_ret_list'))\
                .withColumn('te_' + period, udf_trackerror('ret_list', 'bench_ret_list', 'fnd_category'))
        # Drop the intermediate list columns
        nav_agg_part_2 = nav_agg_part_2.drop('ret_list', 'stock_ret_list',
                                             'treasury_ret_list',
                                             'credit_ret_list',
                                             'bench_ret_list', 'date_list',
                                             'fnd_category')
        if is_debug:
            nav_agg_part_2.show()
        if is_write_file:
            nav_agg_part_2.write.option(
                'header',
                'true').mode('overwrite').csv(output_csv_path + str(date) +
                                              "/" + str(output_batch) + '/' +
                                              period)
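
The chained withColumn calls above rely on one pattern throughout: collect_list is evaluated over the window w ordered by date, but the full list is kept only on the row whose date equals the_last_date, with a one-element placeholder array everywhere else, so a single row per fund carries the heavy list columns. A minimal, self-contained sketch of that pattern on toy data (the window here is an assumption that mirrors the w used above: partitioned by sec_id, ordered by date, with an unbounded frame):

from pyspark.sql import SparkSession, Window, functions as func

spark = SparkSession.builder.master("local[1]").getOrCreate()

toy = spark.createDataFrame(
    [("f1", "2020-03-02", 0.01), ("f1", "2020-03-03", -0.02), ("f1", "2020-03-04", 0.03)],
    ["sec_id", "date", "ret"])

# Order by date with an unbounded frame so collect_list sees the whole
# partition, in date order, no matter which row it is evaluated on.
w = (Window.partitionBy("sec_id").orderBy("date")
     .rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing))

toy = toy.withColumn("the_last_date", func.max("date").over(w))
toy = toy.withColumn(
    "ret_list",
    func.when(func.col("date") != func.col("the_last_date"),
              func.array(func.lit(0.0)))
        .otherwise(func.collect_list("ret").over(w)))

# Only the last row per sec_id carries the full, date-ordered return list.
toy[toy.date == toy.the_last_date].show(truncate=False)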
Example #27
0
# %%
# Extract event sequences and groundtruth

udf_normalize = F.udf(
    lambda x: [[
        (x[i][0] - x[0][0] + (x[-1][0] - x[0][0]) /
         (len(x) - 1)) / args.time_divisor,
        float(x[i][1]),
    ] for i in range(len(x))],
    psql.types.ArrayType(psql.types.ArrayType(psql.types.FloatType())),
)

with Timer("extract event sequences"):
    event_seqs = (df_filtered.withColumn(
        "phrase", F.explode("phrases")).withColumn(
            "event", F.array("ts", "type")).groupby("phrase").agg(
                F.array_sort(
                    F.collect_set("event")).alias("event_seq")).filter(
                        F.size("event_seq").between(
                            args.min_seq_length,
                            args.max_seq_length)).withColumn(
                                "event_seq",
                                udf_normalize("event_seq"))).persist()

event_seqs.limit(5).toPandas()

# seq_lengths = (
#     event_seqs.select("phrase", F.size("event_seq").alias("size"))
#     .groupby("size")
#     .count()
#     .sort("size")
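
udf_normalize above shifts each event's timestamp relative to the first event in the sequence, pads by the average inter-event gap, and scales by args.time_divisor. The same arithmetic in plain Python is easier to check by hand; a small sketch with made-up values and a hypothetical time_divisor (it assumes sequences of length >= 2, which the min_seq_length filter above implies):

def normalize(x, time_divisor=10.0):
    # x is a date-ordered list of [timestamp, type] pairs, len(x) >= 2.
    gap = (x[-1][0] - x[0][0]) / (len(x) - 1)   # average inter-event gap
    return [[(t - x[0][0] + gap) / time_divisor, float(k)] for t, k in x]

print(normalize([[100.0, 1], [130.0, 2], [160.0, 1]]))
# -> [[3.0, 1.0], [6.0, 2.0], [9.0, 1.0]]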
 def data2(self):
     return self.spark.range(10).toDF('id') \
         .withColumn("ks", array([lit(i) for i in range(20, 30)])) \
         .withColumn("k", explode(col('ks'))) \
         .withColumn("v2", col('k') * 100) \
         .drop('ks')
 def data(self):
     return (self.spark.range(10).toDF("id").withColumn(
         "vs", array([lit(i) for i in range(20, 30)
                      ])).withColumn("v", explode(col("vs"))).drop("vs"))
Example #30
0
                            # sf.coalesce(sf.col(rule+"_ECOA"),sf.lit("$$$")), sf.lit("^"),\
                            sf.coalesce(sf.col(rule+"_ECOA_code"),sf.lit("$$$")), sf.lit("^"),\
                            # sf.coalesce(sf.col(rule+"_Evaluation"),sf.lit("$$$")), sf.lit("^"),\
                            sf.coalesce(sf.col(rule+"_Evaluation_code"),sf.lit("$$$")), sf.lit("^"),\
                            sf.coalesce(sf.col(rule+"_FilingDate"),sf.lit("$$$")), sf.lit("^"),\
                            sf.coalesce(sf.col(rule+"_PlaintiffName"),sf.lit("$$$")), sf.lit("^"),\
                            sf.coalesce(sf.col(rule+"_ReferenceNumber"),sf.lit("$$$")), sf.lit("^"),\
                            # sf.coalesce(sf.col(rule+"_Status"),sf.lit("$$$")), sf.lit("^"),\
                            sf.coalesce(sf.col(rule+"_StatusDate"),sf.lit("$$$")), sf.lit("^"),\
                            sf.coalesce(sf.col(rule+"_Status_code"),sf.lit("$$$")), sf.lit("^")\
                            ) \
                            )

#dfcsPublicRecordInt.show(2,False)

dfcsPublicRecordInt = dfcsPublicRecordInt.withColumn(
    "csPublicRecordArray",
    sf.array([col for col in dfcsPublicRecordInt.columns
              if col.split("_")[0] == "newcsPublicRecord"]))

dfexplode = dfcsPublicRecordInt.select(sf.col("id"),sf.col("year"),sf.col("month"),sf.col("day") \
            ,sf.explode_outer("csPublicRecordArray").alias("csPublicRecord") \
            )

dfsplitCol = dfexplode.withColumn("csPublicRecord_Amount",sf.split("csPublicRecord",r"\^")[0]) \
                     .withColumn("csPublicRecord_Bankruptcy_AdjustmentPercent",sf.split("csPublicRecord",r"\^")[1]) \
                     .withColumn("csPublicRecord_Bankruptcy_AssetAmount",sf.split("csPublicRecord",r"\^")[2]) \
                     .withColumn("csPublicRecord_Bankruptcy_LiabilitiesAmount",sf.split("csPublicRecord",r"\^")[3]) \
                     .withColumn("csPublicRecord_Bankruptcy_RepaymentPercent",sf.split("csPublicRecord",r"\^")[4]) \
                     .withColumn("csPublicRecord_Bankruptcy_Type", sf.lit(None).cast(StringType())) \
                     .withColumn("csPublicRecord_Bankruptcy_Type_code",sf.split("csPublicRecord",r"\^")[5]) \
                     .withColumn("csPublicRecord_BookPageSequence",sf.split("csPublicRecord",r"\^")[6]) \
                     .withColumn("csPublicRecord_ConsumerComment",sf.split("csPublicRecord",r"\^")[7]) \
                     .withColumn("csPublicRecord_Court", sf.lit(None).cast(StringType())) \
Example #31
0
def main_sdg(spark=None):
    spark = spark
    # Read the metadata file
    with open(f'{wd}' + '/data/sdg_metadata.json', 'r') as json_metadata:
        metadata = json.load(json_metadata)
    # Read the input file
    person_inputs = metadata['dataflows'][0]['sources'][0]
    person_inputs = glob.glob(f'{wd}' + person_inputs['path'])
    # Read the input JSON file
    with open(person_inputs[0], 'r') as json_entrada:
        entrada = json.load(json_entrada)
        df = spark.createDataFrame(entrada)
    # Initialize the dataframes
    df_ok = None
    df_not_ok = None
    # List of all the transformations
    transformaciones = metadata['dataflows'][0]['transformations']
    # Execution process
    for item0 in transformaciones:
        if item0['type'] == 'validate_fields':
            validaciones = item0['params']['validations']
            for item1 in validaciones:
                validaciones_campo = item1['validations']
                dicc = {}
                for item2 in validaciones_campo:
                    df = df.withColumn(
                        item1['field'] + '_' + item2,
                        F.udf(validaciones_func)(F.col(item1['field']),
                                                 F.lit(item2)))
            lista_cols_nuevas = [x for x in df.columns[3:]]
            df = df.withColumn('total',
                               F.udf(suma_bool)(F.array(lista_cols_nuevas)))
            df_ok = df.filter(F.col('total') == True).drop('total')
            df_not_ok = df.filter(F.col('total') == False).drop('total')
            for item in lista_cols_nuevas:
                df_not_ok = df_not_ok.withColumn(
                    'code_' + item,
                    F.when(
                        F.col(item) == False,
                        F.udf(crear_diccionario)(
                            F.lit(item.split("_")[0]),
                            F.lit(item.split("_")[1]))).otherwise(F.lit(None)))
            lista_cols_nuevas_code = [
                x for x in df_not_ok.columns if 'code_' in x
            ]
            df_not_ok = df_not_ok.withColumn(
                'code_total',
                F.udf(list_total)(F.array(lista_cols_nuevas_code)))
            lista_cols_borrar = lista_cols_nuevas + lista_cols_nuevas_code
            df_not_ok = df_not_ok.drop(*lista_cols_borrar)
        if (item0['type'] == 'add_fields') & (item0['params']['input']
                                              == 'validation_ok'):
            for item3 in item0['params']['addFields']:
                tipo = item3['function']
                df_ok = df_ok.withColumn(item3['name'],
                                         F.udf(anadir_campos)(F.lit(tipo)))
        if (item0['type'] == 'add_fields') & (item0['params']['input']
                                              == 'validation_ko'):
            for item3 in item0['params']['addFields']:
                tipo = item3['function']
                df_not_ok = df_not_ok.withColumn(
                    item3['name'],
                    F.udf(anadir_campos)(F.lit(tipo), F.col('code_total')))

    # Final dataframes and write them to .json files on disk
    sinks = metadata['dataflows'][0]['sinks']
    try:
        df_ok = df_ok.select('name', 'age', 'office', 'dt')
    except Exception as e:
        logging.info(e)
        pass
    try:
        df_not_ok = df_not_ok.select('name', 'age', 'office', 'dt',
                                     'arraycoderrorbyfield')
    except Exception as e:
        logging.info(e)
        pass
    escritura(df_ok, sinks, 'ok_with_date', f'{wd}')
    escritura(df_not_ok, sinks, 'validation_ko', f'{wd}')
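
The validation step above fans each field rule out into its own boolean column, reduces them through a Python UDF over F.array, and splits the frame into passing and failing rows. On Spark 3.1+ the same split can be expressed with the built-in forall higher-order function instead of a UDF; a minimal sketch under that assumption, with toy rules and hypothetical column names (this is an alternative, not the metadata-driven flow above):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()

df = spark.createDataFrame(
    [("ana", 31, "Madrid"), (None, -5, "Bilbao")],
    ["name", "age", "office"])

# One boolean column per field-level rule.
checked = (df.withColumn("name_notEmpty", F.col("name").isNotNull())
             .withColumn("age_notNegative", F.col("age") >= 0))

rule_cols = ["name_notEmpty", "age_notNegative"]
checked = checked.withColumn(
    "total", F.forall(F.array(*rule_cols), lambda ok: ok))  # every rule true?

df_ok = checked.filter("total").drop("total", *rule_cols)
df_not_ok = checked.filter(~F.col("total")).drop("total", *rule_cols)
df_ok.show()
df_not_ok.show()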
Example #32
0
def test_auto_mapper_fhir_patient_resource_include_null_properties(
    spark_session: SparkSession,
) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01", "female"),
            (2, "Vidal", "Michael", "1970-02-02", None),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth", "my_gender"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(
        view="members", source_view="patients", keys=["member_id"]
    ).complex(
        Patient(
            id_=FhirId(A.column("member_id")),
            birthDate=A.date(A.column("date_of_birth")),
            name=FhirList(
                [HumanName(use=NameUseCode("usual"), family=A.column("last_name"))],
                include_null_properties=True,
            ),
            gender=A.if_not_null(
                A.column("my_gender"), AdministrativeGenderCode(A.column("my_gender"))
            ),
        )
    )

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert len(sql_expressions) == 21
    assert str(sql_expressions["id"]) == str(
        substring(
            regexp_replace(col("b.member_id"), r"[^A-Za-z0-9\-\.]", "_"), 0, 63
        ).alias("id")
    )
    assert str(sql_expressions["resourceType"]) == str(
        lit("Patient").alias("resourceType")
    )
    assert str(sql_expressions["birthDate"]) == str(
        coalesce(
            to_date(col("b.date_of_birth"), "y-M-d"),
            to_date(col("b.date_of_birth"), "yyyyMMdd"),
            to_date(col("b.date_of_birth"), "M/d/y"),
        ).alias("birthDate")
    )
    assert str(sql_expressions["name"]) == str(
        filter(
            array(
                struct(
                    lit("usual").alias("use"),
                    lit(None).alias("text"),
                    col("b.last_name").alias("family"),
                    lit(None).alias("given"),
                    lit(None).alias("prefix"),
                    lit(None).alias("suffix"),
                    lit(None).alias("period"),
                )
            ),
            lambda x: x.isNotNull(),
        ).alias("name")
    )
    assert str(sql_expressions["gender"]) == str(
        when(col("b.my_gender").isNull(), None)
        .otherwise(col("b.my_gender"))
        .alias("gender")
    )

    result_df.printSchema()
    result_df.show()

    assert (
        result_df.where("member_id == 1").selectExpr("name[0].use").collect()[0][0]
        == "usual"
    )
    assert (
        result_df.where("member_id == 1").selectExpr("name[0].family").collect()[0][0]
        == "Qureshi"
    )

    assert (
        result_df.where("member_id == 2").selectExpr("name[0].use").collect()[0][0]
        == "usual"
    )
    assert (
        result_df.where("member_id == 2").selectExpr("name[0].family").collect()[0][0]
        == "Vidal"
    )
Example #33
0
def create_word_vecs(
    df,
    word_col,
    desired_ops,
    word_vec_col="word_vec",
    normalize=False,
    offset_vals=None,
    scale_vals=None,
    clip_rng=None,
    ndigits=None,
):
    """
    Creates a "word vec" from "word".

    A word is a list of N coordinates that may or may not be consecutive, and
    a word vec is a set of numbers that represents a word.

    The normalizer standardizes word vecs by

                                    (original value) - (offset_val)
         (standardized value)  = ------------------------------------
                                              (scale_val)

    Currently only supports 1 normalization method:
    - mean-MAD: offset_val is the mean and scale_val is the MAD


    Parameters
    ----------
    df: A pyspark.sql.dataframe.DataFrame.
    word_col: String.
              Name of the column that contains a word.
    desired_ops: List of tuples, or list of list of tuples.
                 A list of operations to execute on df to get the word vec. Each
                 tuple (OP_NAME, i, j, ...) is an operation where
                   - OP_NAME = name of the operation used in ops_dict,
                   - i, j, ... = parameters to the lambda function for OP_NAME
                 Tuples in the same list are normalized together. See ops_dict
                 in the code for more detail.

    word_vec_col: String.
                  Name of the new word vec column.

    normalize: One of {False, 'mean-mad'}.
               How to normalize word vec.
    offset_vals: List of floats (optional).
                 The offset value for each component in the word vec (used for inferencing).
    scale_vals: List of floats (optional).
                The scale value for each component in the word vec (used for inferencing).
    clip_rng: Tuple of two integers (optional).
              The (min, max) range to clip each component in the word vec. Values
              smaller than min are replaced with min; values greater than max are capped at max.
    ndigits: Integer (optional).
             Number of decimal digits to round each component in the word vec to.


    Returns
    -------
    Three objects:
    - A pyspark.sql.dataframe.DataFrame with the word column replaced by a new word vec column,
    - offset_vals (list of floats)
    - scale_vals (list of floats)
    """

    # Check normalize parameter
    assert normalize in {False, "mean-mad"}

    othercols = list(set(df.columns) - {word_col})

    # ks are kx, ky, the projection multipliers
    with_ks_df = cheap_ruler(df, word_col)

    # A dictionary of operations (features).
    # Must return each operation as a single-item list, or it will raise an error.
    ops_dict = {
        # Distance project to E-W
        "dx":
        lambda ii, ff: [(with_ks_df[word_col][ff][0] - with_ks_df[word_col][ii]
                         [0]) * with_ks_df.kx],
        # Distance projected to N-S
        "dy":
        lambda ii, ff: [(with_ks_df[word_col][ff][1] - with_ks_df[word_col][ii]
                         [1]) * with_ks_df.ky],
        # Distance
        "d":
        lambda ii, ff: [
            vsqrt(
                vpow(
                    (with_ks_df[word_col][ff][0] - with_ks_df[word_col][ii][0])
                    * with_ks_df.kx,
                    2,
                ) + vpow(
                    (with_ks_df[word_col][ff][1] - with_ks_df[word_col][ii][1])
                    * with_ks_df.ky,
                    2,
                ))
        ],
        # Altitude
        "al":
        lambda ii, ff:
        [with_ks_df[word_col][ff][2] - with_ks_df[word_col][ii][2]],
        # Duration
        "t":
        lambda ii, ff:
        [with_ks_df[word_col][ff][3] - with_ks_df[word_col][ii][3]],
        # Speed
        "s":
        lambda ii, ff: [(vsqrt(
            vpow(
                (with_ks_df[word_col][ff][0] - with_ks_df[word_col][ii][0]) *
                with_ks_df.kx,
                2,
            ) + vpow(
                (with_ks_df[word_col][ff][1] - with_ks_df[word_col][ii][1]) *
                with_ks_df.ky,
                2,
            )) / (with_ks_df[word_col][ff][3] - with_ks_df[word_col][ii][3]))],
    }

    # Given a list of desired operations, find the operation in a dictionary and
    # add to an execution plan. Retain group structure for multiple-column
    # normalization.
    ops = []  # a list of operations to be added to the execution plan
    col_grps = [
    ]  # a list of column names that stores the result of those operations
    for i, op_grp in enumerate(desired_ops):
        col_grp = []
        for j, op in enumerate(op_grp):
            op_name, ii, ff = op

            # Column to store result of operation
            col_name = "_" + str(i) + "_" + str(j)
            col_grp += (col_name, )

            # Find the operation in the dictionary and add to ops
            ops += (ops_dict[op_name](ii, ff)[0].alias(col_name), )

        col_grps += (col_grp, )

    # Flatten the list
    word_vec_cols = [c for grp in col_grps for c in grp]
    with_raw_word_vecs = with_ks_df.select(*othercols, *ops)

    # Normalize
    if normalize:
        if scale_vals is None or offset_vals is None:
            # Compute mean and mad for every group
            offset_vals = []
            scale_vals = []
            for grp in col_grps:
                mu = compute_mean(with_raw_word_vecs, grp)
                mad = compute_mad(with_raw_word_vecs, grp, mean_val=mu)
                offset_vals += [mu] * len(grp)
                scale_vals += [mad] * len(grp)

        scale_ops = []
        scaled_word_vec_cols = []
        for i, cname in enumerate(word_vec_cols):
            scaled_cname = cname + "_scaled"
            scale_ops += (((with_raw_word_vecs[cname] - offset_vals[i]) /
                           scale_vals[i]).alias(scaled_cname), )
            scaled_word_vec_cols += (scaled_cname, )

        with_word_vecs = with_raw_word_vecs.select(*othercols, *scale_ops)

        # Clip features
        if clip_rng is not None:
            with_word_vecs = clip(with_word_vecs, scaled_word_vec_cols,
                                  clip_rng)

        word_vec_cols = scaled_word_vec_cols

    else:
        with_word_vecs = with_raw_word_vecs
        offset_vals = None
        scale_vals = None

    # Round to desired decimal digits
    if ndigits is not None:
        with_word_vecs = round_columns(with_word_vecs,
                                       word_vec_cols,
                                       decimals=ndigits)

    # Combine columns into a vec
    res_df = with_word_vecs.select(
        *othercols,
        array(*(with_word_vecs[col]
                for col in word_vec_cols)).alias(word_vec_col))

    return res_df, offset_vals, scale_vals
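
Per the docstring, the mean-MAD option standardizes each feature group as (value - offset) / scale with the offset taken as the mean and the scale as the MAD. A self-contained sketch of just that step on one toy column, taking MAD to be the mean absolute deviation (compute_mean and compute_mad above are assumed to do the equivalent, and may use a different MAD definition such as the median absolute deviation):

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.master("local[1]").getOrCreate()
df = spark.createDataFrame([(1.0,), (2.0,), (6.0,)], ["dx"])

mu = df.agg(F.mean("dx")).first()[0]                        # offset_val
mad = df.agg(F.mean(F.abs(F.col("dx") - mu))).first()[0]    # scale_val

df.withColumn("dx_scaled", (F.col("dx") - mu) / mad).show()
# mean = 3.0, MAD = (2 + 1 + 3) / 3 = 2.0, so dx_scaled = -1.0, -0.5, 1.5

With create_word_vecs itself, a desired_ops such as [[('dx', 0, 1), ('dy', 0, 1)], [('t', 0, 1)]] would normalize the two displacement features together and the duration feature on its own, since tuples in the same inner list share one offset/scale pair.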
Example #34
0
 def data(self):
     return self.spark.range(10).toDF('id') \
         .withColumn("vs", array([lit(i * 1.0) + col('id') for i in range(20, 30)])) \
         .withColumn("v", explode(col('vs'))) \
         .drop('vs') \
         .withColumn('w', lit(1.0))
    def gngraph_datarepo_qry_getedges(self, dnodeDF, sqlst, nodemode):

        try:
            # first map gnedges
            gnsrch_log(
                'GnSrchOps: datanodes querying for edges and derived-node flags'
            )
            self.get_metaedges_mapped_df()
            self.get_metanodes_mapped_df()

            gnsrch_log('GnSrchOps: Enumerating edges for datanodes on join ')
            cond = [
                ((self.__gnmetaEdgesDF_cached.gntgtnodeid == dnodeDF.gnnodeid)
                 |
                 (self.__gnmetaEdgesDF_cached.gnsrcnodeid == dnodeDF.gnnodeid))
                & (self.__gnmetaEdgesDF_cached.gnedgetype == 'GNDataNodeEdge')
            ]

            jDF = self.__gnmetaEdgesDF_cached.join(dnodeDF, cond, 'inner')
            jDF.show(4)

            e1DF = jDF.select("gnedgeid", "gnedgename", "gnedgetype",
                              "gnsrcnodeid", "gntgtnodeid")

            eDF = e1DF.dropDuplicates(['gnedgeid']).sort('gnedgeid')
            ecount = eDF.count()
            gnsrch_log('GnSrchOps: showing unique edges #nodes ' + str(ecount))
            eDF.show(5)

            mcols = [F.col("gnsrcnodeid"), F.col("gntgtnodeid")]
            res = eDF.withColumn("edgenodes", F.array(mcols))\
                  .select("edgenodes")
            gnsrch_log('GnSrchOps: gnedges filter result 1 ')
            res.show(5)

            f1DF = res.select(F.explode(F.col("edgenodes")).alias("gnnodeid"))
            f1count = f1DF.count()
            gnsrch_log('GnSrchOps: Filter datanodes exploded #nodes ' +
                       str(f1count))
            f1DF.show(10)

            gnsrch_log('GnSrchOps: Filtered datanodes and remove duplicates ')
            f2DF = f1DF.select("gnnodeid").distinct().sort("gnnodeid")
            f2count = f2DF.count()
            gnsrch_log('GnSrchOps: Filter nodes and distict #nodes ' +
                       str(f2count))
            f2DF.show(10)

            derivedNodeDF = dnodeDF.select("gnnodeid").join(
                f2DF, on=['gnnodeid'],
                how='left_anti').distinct().orderBy('gnnodeid')

            gnsrch_log('GnSrchOps: Enumerating derived datanodes ')

            nderivedNodes = derivedNodeDF.count()
            gnsrch_log('GnSrchOps: derived datanodes #of nodes ' +
                       str(nderivedNodes))
            dnJson = {}
            dnDF = None
            if (nderivedNodes > 0):
                derivedNodeDF.show(10)
                derivedNodeList = derivedNodeDF.collect()
                derived_NodeList = []
                for row in derivedNodeList:
                    ####print(row['fnodes'])
                    derived_NodeList.append(row['gnnodeid'])
                ### now iterate over list and get gnnode
                gnsrch_log('GnSrchOps: Node info for derived datanodes ')
                gnsrch_log(derived_NodeList)
                nodelist = []
                nodeid_list = "( "

                i = 0
                for x in derived_NodeList:
                    if (i > 0):
                        nodeid_list += ","
                    nodeid_list += "" + str(x) + ""
                    i = i + 1
                nodeid_list += ")"
                gnsrch_log('GnSrchOps: Getting node info for list ' +
                           nodeid_list)
                sqlstr = "SELECT * from gnmetanodes where gnnodeid in " + nodeid_list + " "
                gnsrch_log('GnGraphSearchOps: executing sql ' + sqlstr)
                dnDF = self.__spark.sql(sqlstr)
                #resJson = jDF.toJSON().map(lambda j: json.loads(j)).collect()
                ###dnJson = dnDF.toJSON().collect()
                dnCount = dnDF.count()
                gnsrch_log('GnSrchOps: Derived datanodes enumerated #nodes ' +
                           str(dnCount))
        except Exception as err:
            gnsrch_log('GnSrchOps: Exception received ' + str(err))
            eDF = None
            dnDF = None

        ####print(nodelist)
        gnsrch_log('GnSrchOps: Completed datanodes gnedges fetch ')
        return (eDF, dnDF)
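
gngraph_datarepo_qry_getedges collects the derived gnnodeid values to the driver and interpolates them into a SQL IN (...) string. Where that list could be large, an equivalent join keeps the lookup on the cluster; a toy sketch of the swapped-in approach (not the class's own code, and it assumes the gnmetanodes data is available as a DataFrame or registered view):

from pyspark.sql import SparkSession

spark = SparkSession.builder.master("local[1]").getOrCreate()

meta = spark.createDataFrame(
    [(1, "n1"), (2, "n2"), (3, "n3")], ["gnnodeid", "gnnodename"])
derived = spark.createDataFrame([(2,), (3,)], ["gnnodeid"])

# Inner join replaces the hand-built "where gnnodeid in (...)" SQL string.
meta.join(derived, on=["gnnodeid"], how="inner").show()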
Example #36
0
 def test_smvIsAnyIn(self):
     df = self.createDF("k:String; v:String;", "a,b;c,d;,").select(array(col("k"), col("v")).alias("arr"))
     res = df.select(col("arr").smvIsAnyIn("a", "z").alias("isFound"))
     expected = self.createDF("isFound:Boolean", "true;false;false")
     self.should_be_same(expected,res)
Example #37
0
 def test_smvGetColName(self):
     df = self.createDF("k:String; v:String;", "a,b;c,d;,")
     self.assertEqual(df.k.smvGetColName(), 'k')
     self.assertEqual(array(df.k, df.v).smvGetColName(), 'array(k,v)')
 def data(self):
     return self.spark.range(10).toDF('id') \
         .withColumn("vs", array([lit(i) for i in range(20, 30)])) \
         .withColumn("v", explode(col('vs'))).drop('vs')
Example #39
0
 def with_explode_column(df):
     import pyspark.sql.functions as F
     df2 = df.withColumn('values', F.array(F.lit(1), F.lit(2)))
     df2 = df2.withColumn('value', F.explode(df2.values))
     return df2