# Schema used to parse the tweets JSON.
tweet_schema = StructType([
    StructField("created_at", StringType(), nullable=True),
    StructField("text", StringType(), nullable=True),
    StructField("place",
                StructType([
                    StructField("name", StringType(), nullable=True),
                    StructField("country_code", StringType(), nullable=True)
                ]),
                nullable=True),
    StructField("user",
                StructType([
                    StructField("location", StringType(), nullable=True),
                    StructField("created_at", StringType(), nullable=True),
                    StructField("updateTime", StringType(), nullable=True)
                ]),
                nullable=True),
    StructField("entities",
                StructType([
                    StructField("hashtags",
                                ArrayType(
                                    StructType([
                                        StructField("text",
                                                    StringType(),
                                                    nullable=True)
                                    ])),
                                nullable=True)
                ]),
                nullable=True)
])
Example #2
    def to_array(col):
        def to_array_(v):
            return v.toArray().tolist()

        return udf(to_array_, ArrayType(DoubleType()))(col)
Example #3
def preprocess_file(bucket_name, file_name):

    raw_data = sql_context.read.json("s3a://{0}/{1}".format(
        bucket_name, file_name))

    # Clean question body
    if (config.LOG_DEBUG):
        print(colored("[PROCESSING]: Cleaning question body...", "green"))
    clean_body = udf(lambda body: filter_body(body), StringType())
    partially_cleaned_data = raw_data.withColumn("cleaned_body",
                                                 clean_body("body"))

    # Concat cleaned question body and question title to form question vector
    if (config.LOG_DEBUG):
        print(
            colored(
                "[PROCESSING]: Concatenating question body and question title...",
                "green"))
    data = partially_cleaned_data.withColumn(
        "text_body", concat(col("title"), lit(" "), col("body")))

    # Tokenize question title
    if (config.LOG_DEBUG):
        print(colored("[PROCESSING]: Tokenizing text vector...", "green"))
    tokenizer = Tokenizer(inputCol="text_body",
                          outputCol="text_body_tokenized")
    tokenized_data = tokenizer.transform(data)

    # Remove stop words
    if (config.LOG_DEBUG):
        print(colored("[PROCESSING]: Removing stop words...", "green"))
    stop_words_remover = StopWordsRemover(
        inputCol="text_body_tokenized",
        outputCol="text_body_stop_words_removed")
    stop_words_removed_data = stop_words_remover.transform(tokenized_data)

    # Stem words
    if (config.LOG_DEBUG):
        print(colored("[PROCESSING]: Stemming tokenized vector...", "green"))
    stem = udf(lambda tokens: lemmatize(tokens), ArrayType(StringType()))
    stemmed_data = stop_words_removed_data.withColumn(
        "text_body_stemmed", stem("text_body_stop_words_removed"))

    # Shingle resulting body
    # if (config.LOG_DEBUG): print(colored("[PROCESSING] Shingling resulting text body...", "green"))
    # shingle = udf(lambda tokens: get_two_gram_shingles(tokens), ArrayType(ArrayType(StringType())))
    # shingled_data = stemmed_data.withColumn("text_body_shingled", shingle("text_body_stemmed"))

    # Extract data that we want
    final_data = stemmed_data
    final_data.registerTempTable("final_data")

    preprocessed_data = sql_context.sql(
        "SELECT title, body, creation_date, text_body, text_body_stemmed, post_type_id, tags, score, comment_count, view_count, id from final_data"
    )

    # Write to AWS
    if (config.LOG_DEBUG):
        print(colored("[UPLOAD]: Writing preprocessed data to AWS...",
                      "green"))
    write_aws_s3(config.S3_BUCKET_BATCH_PREPROCESSED, file_name,
                 preprocessed_data)
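
# Hedged sketch of the write_aws_s3 helper called above; the real helper is
# defined elsewhere in this project, so the output format and write mode here
# are assumptions.
def write_aws_s3(bucket_name, file_name, df):
    df.write.mode("overwrite").json("s3a://{0}/{1}".format(bucket_name, file_name))
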
def generate_frequent_words(filename_read_S3, filename_write_S3):
    """
    Generate a list of most frequent words for each subreddit.

    :param filename_read_S3: S3 file location to be read
    :param filename_write_S3: S3 file location to write to
    :return: None
    """
    # get data -
    print("Step 1: read cleaned file into Dataframe from S3")
    comments_df1 = sqlContext.read.parquet(filename_read_S3)

    # select limited columns
    comments_df2 = comments_df1.select('subreddit', 'subreddit_id', 'year',
                                       'month', 'body_without_stopwords')
    print("schema of dataset:")
    comments_df2.printSchema()

    # -------------------------
    # WORD Count
    # -------------------------
    print("Step 3: Remove punctuation from body_without_stopwords")
    comments_df3 = comments_df2.select(
        'subreddit', 'subreddit_id', 'year', 'month',
        removePunctuation(col('body_without_stopwords')))

    print("Step 3.1: Apply word lemmatization to generate base words")
    # register UDF
    spark.udf.register("lemma", lemma, ArrayType(StringType()))
    lemma_udf = udf(lemma)
    # run transformation
    comments_df31 = comments_df3.withColumn("lemmatized_body",
                                            lemma_udf(col("cleaned_body")))
    comments_df31.printSchema()

    # remove punctuation again
    comments_df32 = comments_df31.select(
        'subreddit', 'subreddit_id', 'year', 'month',
        removePunctuation(col('lemmatized_body')))

    print("Step 4: Split lines into words to generate the count")
    comments_df4 = (comments_df31.select(
        explode(split(comments_df31.cleaned_body, ' ')).alias('word'),
        'subreddit', 'subreddit_id', 'year', 'month').where(col('word') != ''))

    comments_df4.printSchema()

    print("Step 5: Get word counts")
    comments_df5 = wordCount(comments_df4).orderBy("count", ascending=False)

    print("Step 6: Rank each word by count within its subreddit using a window")
    window = Window.partitionBy(comments_df5['subreddit_id']).orderBy(
        comments_df5['count'].desc())

    print("Step 7: Keep the top 5 ranked words per subreddit")
    comments_df6 = comments_df5.select(
        '*',
        rank().over(window).alias('rank')).filter(col('rank') <= 5)

    print("writing data to S3")
    # ----------------------
    # Store to S3 - as Parquet
    # ----------------------
    print("Step 8: Generate parquet file for the words and load to S3")
    comments_df6.write.parquet(filename_write_S3)
    print("Completed writing data to S3")
    return
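
# Hedged sketches of the removePunctuation and wordCount helpers used above; the
# originals are defined elsewhere in this job, so the exact regex, alias and
# grouping columns are assumptions.
from pyspark.sql.functions import regexp_replace, trim, lower

def removePunctuation(column):
    # Lowercase, drop non-alphanumeric characters and trim; aliased to match the
    # 'cleaned_body' column referenced inside generate_frequent_words above.
    return trim(lower(regexp_replace(column, r'[^\sa-zA-Z0-9]', ''))).alias('cleaned_body')

def wordCount(df):
    # One row per word with its occurrence count; the grouping keys are assumed.
    return df.groupBy('subreddit', 'subreddit_id', 'year', 'month', 'word').count()
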
# The snippet begins mid-function here; a plausible reconstruction of the
# lemmatization UDF applied below (the wrapper name is assumed):
def lemmatize_line(line):
    return [wordnet_lemmatizer.lemmatize(word, pos="v") for word in line]

lemma = udf(lemmatize_line, ArrayType(StringType()))

text = removedsw.withColumn("lemma",lemma(removedsw.filtered_words))



def unitoarr(line):
	s = []
	for w in line:
		w = w.strip()
		if len(w) != 1 and w != "" and w != ' ' and len(w)>2:
			s.append(w) 
	return s


unitoarr_udf =  udf(unitoarr, ArrayType(StringType()))

text2 = text.withColumn("review",unitoarr_udf(text.lemma)).withColumn("label",change_labels(text.stars))


ngram = NGram(n=2, inputCol="review", outputCol="ngrams")

ngramDataFrame = ngram.transform(text2)

cv = CountVectorizer(inputCol="ngrams", outputCol="features")

models = cv.fit(ngramDataFrame)

result = models.transform(ngramDataFrame)

result1 = result.select("business_id","text","stars","label","features","ngrams")
Example #6
    "boolean": BooleanType,
    "struct": StructType,
    "array": ArrayType,
    "bigint": LongType,
    "date": DateType,
    "byte": ByteType,
    "short": ShortType,
    "datetime": TimestampType,
    "binary": BinaryType,
    "null": NullType,
    "vector": VectorUDT
}

SPARK_DTYPES_DICT_OBJECTS = \
    {"string": StringType(), "int": IntegerType(), "float": FloatType(),
     "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()),
     "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(),
     "datetime": TimestampType(), "binary": BinaryType(), "null": NullType()
     }

# Profiler
PROFILER_TYPES = {
    "int", "float", "string", "bool", "date", "null", "array", "double"
}
PROFILER_LEGEND_TYPES = {
    "string": "ABC",
    "int": "#",
    "integer": "#",
    "float": "##.#",
    "double": "##.#",
    "bigint": "#"
}

# save model run to mlflow
with mlflow.start_run(run_name='deployment run') as run:
    mlflow.pyfunc.log_model('model',
                            python_model=_lifetimesModelWrapper(model),
                            conda_env=conda_env)

# COMMAND ----------

# MAGIC %md Now that our model, along with its dependency information and class wrapper, has been recorded, let's use mlflow to convert the model into a function we can employ against a Spark DataFrame:

# COMMAND ----------

from pyspark.sql.types import ArrayType, FloatType

# define the schema of the values returned by the function
result_schema = ArrayType(FloatType())

# define function based on mlflow recorded model
probability_alive_udf = mlflow.pyfunc.spark_udf(spark,
                                                'runs:/{0}/model'.format(
                                                    run.info.run_id),
                                                result_type=result_schema)

# register the function for use in SQL
_ = spark.udf.register('probability_alive', probability_alive_udf)

# COMMAND ----------

# MAGIC %md Assuming we had access to customer metrics for frequency, recency and age, we can now use our function to generate some predictions:

# COMMAND ----------
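# A hedged sketch of such a prediction query (not from the original notebook):
# the customer_metrics table and its customer_id, frequency, recency and T
# (customer age) columns are assumptions.
predictions = spark.sql("""
    SELECT customer_id,
           probability_alive(frequency, recency, T) AS p_alive
    FROM customer_metrics
""")
predictions.show(5)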
Example #8
from pyspark.ml import PipelineModel
from pyspark.ml.linalg import Vectors, VectorUDT

# Command to run this application:
# spark-submit --packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.1.1,org.elasticsearch:elasticsearch-spark-30_2.12:7.12.1 --master local[*] app.py
logging.basicConfig(format='%(asctime)s - %(levelname)s - %(message)s',
                    level=logging.ERROR)

elastic_host = "elasticsearch"
elastic_index = "matches"
kafkaServer = "kafkaserver:9092"
topic = "dota_lineup"

# Schema of the input data
schema = StructType([
    StructField("dire_lineup", ArrayType(IntegerType(), False), False),
    StructField("radiant_lineup", ArrayType(IntegerType(), False), False),
    StructField("radiant_win", BooleanType(), False),
    StructField("match_seq_num", LongType(), False)
])

# Spark configuration, mainly needed for the elasticsearch plugin
sparkConf = SparkConf().set("spark.app.name", "dotingestion2") \
                        .set("es.nodes", "elasticsearch") \
                        .set("es.port", "9200") \
                        .set("es.mapping.id", "match_seq_num") \
                        .set("es.write.operation", "upsert")

# Load the hero_id conversions
with open("heroes.json", 'r', encoding="utf-8") as f:
    heroes_dict = {hero['id']: i for i, hero in enumerate(loads(f.read()))}
Example #9
def convert_types_for_es(df: DataFrame) -> DataFrame:
    to_array = udf(lambda v: v.toArray().tolist(), ArrayType(FloatType()))

    return df.withColumn("radiant_win_prediction", df.prediction.cast(BooleanType())) \
             .withColumn("probability_arr", to_array(df.probability))
Example #10
# connection to the Kafka broker
v_broker = "ec2-34-236-190-208.compute-1.amazonaws.com:9092"
v_ckpt_loc = "/tmp/checkpoint"

spark = SparkSession.builder.appName("Structured").getOrCreate()
# read the stream from the topic
raw=spark.readStream.format("kafka")\
.option("kafka.bootstrap.servers",v_broker)\
.option("startingOffsets", "earliest")\
.option("subscribe","sensor").load()

# Schema of the incoming data
schema = StructType()\
.add("current", StructType()\
.add("fromDateTime", StringType())\
.add("indexes", ArrayType(StructType().add("advice",StringType()).add("color",StringType()).add("description",StringType()).add("name",StringType()).add("value",DoubleType())))\
.add("standards", ArrayType(StructType().add("averaging",StringType()).add("limit",StringType()).add("name",StringType()).add("percent",DoubleType()).add("pollutant",StringType())))\
.add("tillDateTime", StringType())\
.add("values", ArrayType(StructType().add("name",StringType()).add("value",StringType())))
      )\
.add("forecast", ArrayType(StructType().add("fromDateTime",StringType())\
.add("indexes",ArrayType(StructType().add("advice",StringType()).add("color",StringType()).add("description",StringType()).add("name",StringType()).add("value",DoubleType())) ) \
.add("standards",ArrayType(StructType().add("averaging",StringType()).add("limit",StringType()).add("name",StringType()).add("percent",DoubleType()).add("pollutant",StringType()))) \
.add("tillDateTime", StringType())\
.add("values", ArrayType(StructType().add("name",StringType()).add("value",StringType())))\
))\
.add("history", ArrayType(StructType().add("fromDateTime",StringType())\
.add("indexes",ArrayType(StructType().add("advice",StringType()).add("color",StringType()).add("description",StringType()).add("name",StringType()).add("value",DoubleType())) ) \
.add("standards",ArrayType(StructType().add("averaging",StringType()).add("limit",StringType()).add("name",StringType()).add("percent",DoubleType()).add("pollutant",StringType()))) \
.add("tillDateTime", StringType())\
.add("values", ArrayType(StructType().add("name",StringType()).add("value",StringType())))\
))
Example #11
    studentMarks2 = [
        Row(1, Row("john", "doe"), 6, [70.0, 35.0, 85.0]),
        Row(2, Row("jane", "doe"), 9, [80.0, 35.0, 92.5, 35.0, 46.0])
    ]
    
    studentMarks2Rdd = spark.sparkContext.parallelize(studentMarks2, 4)

    schema2 = StructType()\
        .add("id", IntegerType(), nullable=True)\
        .add("name", StructType()\
             .add("first", StringType(), nullable=True)\
             .add("last", StringType(), nullable=True)
             , nullable=True)\
        .add("standard", IntegerType(), True)\
        .add("marks", ArrayType(DoubleType(), containsNull=False), nullable = True)

    studentMarks2DF = spark.createDataFrame(studentMarks2Rdd, schema2)

    print("Schema with array")
    studentMarks2DF.printSchema()

    print("DataFrame with array")
    studentMarks2DF.show()

    print("Count elements of each array in the column")
    studentMarks2DF.select("id", F.size("marks").alias("count")).show()

    print("Explode the array elements out into additional rows")
    studentMarks2DF.select("id", F.explode("marks").alias("scores")).show()
Example #12
        # weeks.append(long(date_item.strftime("%Y%W")))
        text_week_time = date_item + timedelta(7)
        week_id = long(date_item.strftime("%Y%W"))
        next_week_id = long(text_week_time.strftime("%Y%W"))
        week_and_next_week = Row("week_id", "next_week_id")(week_id,
                                                            next_week_id)
        weeks.append(week_and_next_week)
        date_item += timedelta(7)

    if len(weeks) == 0:
        weeks = [Row("week_id", "next_week_id")(week_fake, week_fake)]

    return weeks
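
# TimeStructType is defined elsewhere in the original job; a hedged sketch of a
# definition consistent with the Row("week_id", "next_week_id") built above:
from pyspark.sql.types import StructType, StructField, LongType

TimeStructType = StructType([
    StructField("week_id", LongType()),
    StructField("next_week_id", LongType()),
])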


get_weeks = f.udf(get_weeks, ArrayType(TimeStructType))

# def get_week_timestamp_from_week_id(week_id):
#     return 0L
#
# get_week_timestamp_from_week_id = f.udf(get_week_timestamp_from_week_id, LongType())


def get_df_student_package(glueContext):
    dyf_student_package = glueContext.create_dynamic_frame.from_options(
        connection_type="redshift",
        connection_options={
            "url":
            REDSHIFT_DATABASE,
            "user":
            REDSHIFT_USERNAME,
from pyspark.sql.functions import from_json, to_json, col, unbase64, base64, split, expr
from pyspark.sql.types import StructField, StructType, StringType, BooleanType, ArrayType, DateType, FloatType

# TO-DO: create a StructType for the Kafka redis-server topic which has all changes made to Redis - before Spark 3.0.0, schema inference is not automatic
redisMessageSchema = StructType(
    [
        StructField("key", StringType()),
        StructField("value", StringType()),
        StructField("expiredType", StringType()),
        StructField("expiredValue",StringType()),
        StructField("existType", StringType()),
        StructField("ch", StringType()),
        StructField("incr",BooleanType()),
        StructField("zSetEntries", ArrayType( \
            StructType([
                StructField("element", StringType()),\
                StructField("score", StringType())   \
            ]))                                      \
        )

    ]
)

# TO-DO: create a StructType for the Customer JSON that comes from Redis- before Spark 3.0.0, schema inference is not automatic

customerJSONSchema = StructType([
    StructField("customerName", StringType()),
    StructField("email", StringType()),
    StructField("phone", StringType()),
    StructField("birthDay", StringType())
])
Example #14
def generate_column_names(initial, intermediate_count, final):
    columns = ["__col_{:02d}".format(idx) for idx in range(intermediate_count)]
    columns.insert(0, initial)
    columns.append(final)
    return columns


if __name__ == "__main__":
    sc = pyspark.SparkContext('local[*]', 'PipelineFlow')
    sess = pyspark.sql.SparkSession(sc)
    rdd = sc.wholeTextFiles('data/*')
    rdd = rdd.map(lambda x: (x[0], json.loads(x[1])))
    print(type(rdd.take(1)[0][1][0]))
    schema = StructType([
        StructField('file', StringType(), True),
        StructField('content', ArrayType(MapType(StringType(), StringType())),
                    True)
    ])
    df = sess.createDataFrame(rdd, schema)

    trans_manager = TransformerModuleManager("modules")
    print("Available transformers' names: {}".format(", ".join(
        trans_manager.loaded_transformers_names)))

    loaded_transformers = trans_manager.loaded_transformers
    col_names = generate_column_names("content",
                                      len(loaded_transformers) - 1,
                                      "sentences")

    stages = list(
        map(
Example #15
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName('unit3').getOrCreate()

tokensDF = spark.read.json(
    "/user/cs4984cs5984f18_team4/4_Attack_Westminster_big/Attack_Westminster_big_tokenized.json"
)

import nltk
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType
from collections import Counter
from nltk.corpus import stopwords
import string

# [1] Tag tokens with POS
POSTagUDF = udf(lambda x: nltk.pos_tag(x), ArrayType(ArrayType(StringType())))
# define the stop word set before it is referenced in the RDD pipeline below
custom_stopwords = [
    "``", "''", "'s", "said", "could", "also", "news", "--", "..."
]
stop_words = set(
    stopwords.words('english') + list(string.punctuation) + custom_stopwords)

posRDD = tokensDF.rdd.flatMap(
    lambda x: nltk.pos_tag(x.tokens_with_stopwords)).map(lambda x: (x[0].lower(
    ), x[1])).filter(lambda x: x[0] not in stop_words)

# [2] Get most frequent nouns
counter = Counter(
    posRDD.filter(lambda x: x[1][0] == 'N').map(lambda x: x[0]).collect())
countDF = spark.createDataFrame(counter.most_common(100), ['noun', 'count'])
countDF.write.csv(
Example #16
    def hist_date(df, col_name):
        """
        Create a histogram for a date type column
        :param df: Dataframe to be analyzed
        :param col_name: Dataframe column to be analyzed
        :return:
        """
        col_info = {}

        # Create year/month/week day/hour/minute

        def func_infer_date(value, args):
            if value is None:
                result = [None]
            else:
                date = dateutil.parser.parse(value)
                result = [
                    date.year, date.month,
                    date.weekday(), date.hour, date.minute
                ]
            return result

        df = (df.cols.select(col_name).cols.apply(
            col_name, func_infer_date, ArrayType(
                LongType())).cols.unnest(col_name).h_repartition().cache())

        for i in range(5):
            key_name = ""
            temp_col = col_name + "_" + str(i)
            # Years
            if i == 0:
                buckets_date = 100
                key_name = "years"

                min_value = df.cols.min(temp_col)
                max_value = df.cols.max(temp_col)

            # Months
            elif i == 1:
                buckets_date = 12
                min_value = 0
                max_value = 12
                key_name = "months"

            # Weekdays
            elif i == 2:
                buckets_date = 7
                min_value = 0
                max_value = 7
                key_name = "weekdays"

            # Hours
            elif i == 3:
                buckets_date = 24
                min_value = 0
                max_value = 24
                key_name = "hours"

            # Minutes
            elif i == 4:
                buckets_date = 60
                min_value = 0
                max_value = 60
                key_name = "minutes"

            col_info[key_name] = df.cols.hist(temp_col, min_value, max_value,
                                              buckets_date)

        return col_info
Example #17
    in_df.coupon.cast(DecimalType(10, 5)),
    in_df['yield'].cast(DecimalType(10, 5)), in_df.type,
    in_df.duration.cast(IntegerType()))

in_cast_df.show()

periodic_value_schema = StructType([
    StructField("period", IntegerType(), True),
    StructField("cp", DecimalType(12, 5), True),
    StructField("pv", DecimalType(12, 5), True),
    StructField("aggpv", DecimalType(12, 5), True),
    StructField("quote", DecimalType(12, 5), True)
])

udf_calc_periodic_value = udf(calc_periodic_value,
                              ArrayType(periodic_value_schema))

in_cast_df.withColumn("periodic_value", udf_calc_periodic_value(in_cast_df["value"], in_cast_df["coupon"],
                                                                in_cast_df["yield"], in_cast_df["type"],
                                                                in_cast_df["duration"]))\
    .withColumn("periodic_value", explode("periodic_value"))\
    .select("id", "value", "periodic_value.period", "periodic_value.cp", "periodic_value.pv", "periodic_value.aggpv",
            "periodic_value.quote")\
    .createOrReplaceTempView("periodic_value_table")

spark.sql(
    "select id as `Bond ID`, period as `Period`, cp as `Coupon payment`, pv as `PV of periodic payments`, "
    "aggpv as A from periodic_value_table").show(50)

spark.sql(
    "select id as `Bond ID`, aggpv as A, value as `FV`, quote as `Quote` "
Example #18
total_counts = rawFeatures.rdd.map(lambda row: row['rawFeatures'].toArray(
)).reduce(lambda x, y: [x[i] + y[i] for i in range(len(y))])

vectorizerModel = model.stages[1]
vocabList = vectorizerModel.vocabulary
d = {'vocabList': vocabList, 'counts': total_counts}
spark.createDataFrame(np.array(list(d.values())).T.tolist(),
                      list(d.keys())).show()

from pyspark.sql.functions import udf
import pyspark.sql.functions as F
from pyspark.sql.types import DoubleType, IntegerType
from pyspark.sql.types import ArrayType, StringType

indices_udf = udf(lambda vector: vector.indices.tolist(),
                  ArrayType(IntegerType()))
values_udf = udf(lambda vector: vector.toArray().tolist(),
                 ArrayType(DoubleType()))


def termsIdx2Term(vocabulary):
    def termsIdx2Term(termIndices):
        return [vocabulary[int(index)] for index in termIndices]

    return udf(termsIdx2Term, ArrayType(StringType()))


rawFeatures.withColumn('indices', indices_udf(F.col('rawFeatures'))) \
    .withColumn('values', values_udf(F.col('rawFeatures'))) \
    .withColumn("Terms", termsIdx2Term(vocabList)("indices")).show()
Example #19
def select_relevant_columns(df,
                            discrete_action: bool = True,
                            include_possible_actions: bool = True):
    """ Select all the relevant columns and perform type conversions. """
    if not discrete_action and include_possible_actions:
        raise NotImplementedError(
            "currently we don't support include_possible_actions")

    select_col_list = [
        col("reward").cast(FloatType()),
        col("state_features").cast(ArrayType(FloatType())),
        col("state_features_presence").cast(ArrayType(BooleanType())),
        col("next_state_features").cast(ArrayType(FloatType())),
        col("next_state_features_presence").cast(ArrayType(BooleanType())),
        col("not_terminal").cast(BooleanType()),
        col("action_probability").cast(FloatType()),
        col("mdp_id").cast(LongType()),
        col("sequence_number").cast(LongType()),
        col("step").cast(LongType()),
        col("time_diff").cast(LongType()),
        col("metrics").cast(ArrayType(FloatType())),
        col("metrics_presence").cast(ArrayType(BooleanType())),
    ]

    if discrete_action:
        select_col_list += [
            col("action").cast(LongType()),
            col("next_action").cast(LongType()),
        ]
    else:
        select_col_list += [
            col("action").cast(ArrayType(FloatType())),
            col("next_action").cast(ArrayType(FloatType())),
            col("action_presence").cast(ArrayType(BooleanType())),
            col("next_action_presence").cast(ArrayType(BooleanType())),
        ]

    if include_possible_actions:
        select_col_list += [
            col("possible_actions_mask").cast(ArrayType(LongType())),
            col("possible_next_actions_mask").cast(ArrayType(LongType())),
        ]

    return df.select(*select_col_list)
Example #20
def termsIdx2Term(vocabulary):
    def termsIdx2Term(termIndices):
        return [vocabulary[int(index)] for index in termIndices]

    return udf(termsIdx2Term, ArrayType(StringType()))
Example #21
def pyspark_script_console11(inputs, settings):
    data = inputs.get('data', None)
    df = inputs.get('df', None)
    df1 = inputs.get('df1', None)
    df2 = inputs.get('df2', None)
    df3 = inputs.get('df3', None)
    transformer = inputs.get('transformer', None)
    estimator = inputs.get('estimator', None)
    model = inputs.get('model', None)

    import re
    import datetime
    from pyspark.sql.functions import udf
    from pyspark.sql.types import IntegerType, StringType, ArrayType

    def p_ordinalDate(string):
        start = datetime.datetime.strptime(string.strip(), '%d/%m/%Y')
        return start.toordinal()

    def p_time(string):
        hours = int(string.split(":")[0])
        if "PM" in string: hours += 12
        return hours

    def p_entryLocation(string):
        vectors1 = ['PREMISES-REAR', 'PREMISES-FRONT', 'PREMISES-SIDE']
        for x in vectors1:
            if x in string: return x
        return "UNKNOWN"

    def p_entryPoint(string):
        vectors2 = ['POINT OF ENTRY-DOOR', 'POINT OF ENTRY-WINDOW', \
                    'POINT OF ENTRY-FENCE', 'POINT OF ENTRY-DOOR: GARAGE']
        vectors3 = [
            'POE - DOOR', 'POE - WINDOW', 'POE - FENCE', 'POE - GARAGE'
        ]
        for x, y in list(zip(vectors2, vectors3)):
            if x in string or y in string: return x
        return "UNKNOWN"

    def p_dayOfWeek(string):
        start = datetime.datetime.strptime(string, '%d/%m/%Y')
        return start.weekday()

    def p_northingEasting(string, string2):
        return "%s-%s" % (string, string2)

    def p_methodOfEntry(string):
        if string is None:
            return ''

        narrative = string.split(
            "__________________________________ CREATED BY")[-1]
        if 'NARRATIVE' in narrative or 'CIRCUMSTANCES' in narrative:
            narrative = re.split('NARRATIVE|CIRCUMSTANCES', narrative)[-1]
            narrative = re.split("\*|:", narrative[1:])[0]
        return narrative

        # Classifies if the search was messy

    def p_messy(string):
        negations = ["NOT ", "NO ", "HAVEN'T", "DIDN'T", 'DIDNT', "HAVENT"]
        messywords = ['MESSY', 'MESSIL', 'RUMMAG', 'TIPPED']
        sentences = [
            sentence + '.' for sentence in string.split(".")
            if any(word in sentence for word in messywords)
        ]
        c = 0
        for x in sentences:
            if any(word in x for word in negations):
                c -= 1
            else:
                c += 1
        return 1 if c > 0 else 0

    def p_signature(string):
        if "DEFECA" in string:
            return 1
        if "URINAT" in string:
            return 2
        if "MASTURB" in string:
            return 3
        if "GRAFFIT" in string:
            return 4
        return "UNKNOWN"

    def p_propertySecure(string):
        verbs = ['LOCKED', 'FENCED', 'GATED', 'SECURED', 'BOLTED']
        negations = ["NOT ", "NO ", "HAVEN'T", "DIDN'T", 'DIDNT', "HAVENT"]
        c = 0
        sentences = [
            sentence + '.' for sentence in string.split(".")
            if any(word in sentence for word in verbs)
        ]
        for x in sentences:
            if any(word in x for word in negations):
                c -= 1
            else:
                c += 1
        return 1 if c > 0 else 0

    import nltk
    from nltk.parse.stanford import StanfordDependencyParser
    import string as string_module

    stemmer = nltk.stem.porter.PorterStemmer()
    parser = StanfordDependencyParser(
        path_to_models_jar=
        '/Users/Chao/nzpolice/summer/stanford-parser/stanford-parser-3.8.0-models.jar',
        model_path='edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz',
        path_to_jar=
        '/Users/Chao/nzpolice/summer/stanford-parser/stanford-parser.jar',
        java_options='-Xmx1000M',
        verbose=False)
    remove_punctuation_map = dict(
        (ord(char), None) for char in string_module.punctuation)
    unigram_tagger = nltk.tag.UnigramTagger(nltk.corpus.brown.tagged_sents())
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

    # For vectorizing text
    def stem_tokens(tokens):
        return [stemmer.stem(item) for item in tokens]

        # Normalizes text (i.e, tokenizes and then stems words)

    def normalize(text):
        return stem_tokens(
            nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

    def p_propertyStolenList(string):
        if "PROPERTY" not in string:
            return []
        property_list = " ".join([
            re.split(':|_', listing)[0] for listing in re.split(
                "PROPERTY LIST SUMMARY:|PROPERTY STOLEN:", string)
        ])
        text = normalize(property_list)
        tagged = unigram_tagger.tag(text)
        removable = [
            'modus', 'operandi', 'call', 'with', 'list', 'of', 'location',
            'point', 'entry', 'value', 'property',
            'police', 'stage', 'name', 'details', 'insured', 'victim',
            'address'
        ]
        o = []
        for x in tagged:
            if (not (x[1] in ["NN", "NNS"])) or (x[0] in removable):
                pass
            else:
                if not len(x[0]) < 3:
                    o.append(x[0])
        return o

    def p_pullMOTags(string):
        sentences = sent_tokenizer.tokenize(string)
        sentences = [sent.lower().capitalize() for sent in sentences]
        x_relations = []
        for sent in sentences:
            if len(sent.split(" ")) > 100: continue
            try:
                parsed = parser.raw_parse(sent)
                triples = [parse.triples() for parse in parsed]
                selected = [
                    triple for triple in triples[0]
                    if (triple[1] in ("dobj", "nsubjpass"))
                ]
            except:
                continue
            for x in selected:
                x_relations.append(x)
        return x_relations

        # def stem_tokens(tokens):

    # 	return [stemmer.stem(item) for item in tokens]
    #
    #
    # # Normalizes text (i.e, tokenizes and then stems words)
    # def normalize(text):
    # 	if text is None:
    # 		return []
    # 	return stem_tokens(nltk.word_tokenize(text.lower().translate(remove_punctuation_map)))

    udf_ordinal_date = udf(p_ordinalDate, IntegerType())
    udf_time = udf(p_time, IntegerType())
    udf_entry_location = udf(p_entryLocation, StringType())
    udf_entry_point = udf(p_entryPoint, StringType())
    udf_day_of_week = udf(p_dayOfWeek, IntegerType())
    udf_northing_easting = udf(p_northingEasting, StringType())
    udf_method_of_entry = udf(p_methodOfEntry, StringType())  # *
    udf_messy = udf(p_messy, IntegerType())
    udf_signature = udf(p_signature, IntegerType())
    udf_property_secure = udf(p_propertySecure, IntegerType())
    udf_property_stolen_list = udf(p_propertyStolenList,
                                   ArrayType(StringType()))
    udf_pull_mo_tags = udf(p_pullMOTags, ArrayType(StringType()))

    # udf_normalize = udf(normalize, ArrayType(StringType()))

    FEATURES_TO_USE = [
        ('ordinalDate', 'Occurrence Start Date', udf_ordinal_date),
        ('time', 'Occurrence Start Time', udf_time),
        ('entryLocation', 'Narrative', udf_entry_location),
        ('entryPoint', 'Narrative', udf_entry_point),
        ('dayOfWeek', 'Occurrence Start Date', udf_day_of_week),
        ('northingEasting', ('NZTM Location Northing',
                             'NZTM Location Easting'), udf_northing_easting),
        ('methodOfEntry', 'Narrative', udf_method_of_entry),
        ('messy', 'methodOfEntry', udf_messy),
        ('signature', 'Narrative', udf_signature),
        ('propertySecure', 'Narrative', udf_property_secure),
        ('propertyStolenWordnet', 'Narrative', udf_property_stolen_list),
        # ('cosineTFIDF', 'Narrative', udf_method_of_entry),
        # ('cosineTFIDF2', 'Narrative', udf_method_of_entry),
        ('cosineMO', 'methodOfEntry', udf_pull_mo_tags),
        # ('propertyStolenWordNetNA', 'Narrative', udf_property_stolen_list),
        # ('listSimilarity', 'Narrative', udf_property_stolen_list),
        # ('moSim', 'methodOfEntry', udf_pull_mo_tags),
    ]

    df = df.na.fill({'Narrative': ''})
    # df.na.drop(subset=["Narrative"])

    for t in FEATURES_TO_USE:
        new_col = t[0]
        func = t[2]
        in_cols = t[1]
        params = (df[c] for c in t[1]) if isinstance(in_cols,
                                                     tuple) else [df[in_cols]]
        df = df.withColumn(new_col, func(*params))

    return {
        'data': data,
        'df': df,
        'df1': df1,
        'df2': df2,
        'df3': df3,
        'transformer': transformer,
        'estimator': estimator,
        'model': model
    }
# MAGIC Let's look at one way to apply the spaCy NLP pipeline to our tweets using SQL and a user defined function (UDF):

# COMMAND ----------

from pyspark.sql.types import ArrayType, FloatType, StringType
import spacy
nlp = spacy.load("en_core_web_sm")


def getVerbs(text):
    doc = nlp(text)
    verbs = [token.lemma_ for token in doc if token.pos_ == "VERB"]
    return verbs


spark.udf.register("getVerbs", getVerbs, ArrayType(StringType()))

# COMMAND ----------

# MAGIC %md
# MAGIC We can now use the UDF in our SQL statements to extract verbs from a tweet:

# COMMAND ----------

# MAGIC %sql
# MAGIC select normalized_text
# MAGIC       ,getVerbs(normalized_text) as `verbs`
# MAGIC from tweets_clean_for_nlp

# COMMAND ----------
Example #23
def get_data():
    data = """
[
  {
    "friends": [
      {
        "id": 0,
        "name": "Georgina Sears"
      },
      {
        "id": 1,
        "name": "Miranda Tillman"
      },
      {
        "id": 2,
        "name": "Rosario Doyle"
      }
    ]
  },
  {
    "friends": [
      {
        "id": 0,
        "name": "Manuela Noble"
      },
      {
        "id": 1,
        "name": "Aguilar Roy"
      },
      {
        "id": 2,
        "name": "Holt Espinoza"
      }
    ]
  },
  {
    "friends": [
      {
        "name": "Manuela Noble"
      },
      {
        "name": "Aguilar Roy"
      },
      {
        "id": 2,
        "name": "Holt Espinoza"
      }
    ]
  }
]
    """
    data_dict = json.loads(data)
    print(data_dict)
    schema = StructType().add(
        "friends",
        ArrayType(
            StructType([
                StructField("id", StringType()),
                StructField("name", StringType())
            ])))
    df = spark.createDataFrame(data_dict, schema)
    return df
Example #24
def test_exploding_data_frame(spark_session):
    sc = spark_session.sparkContext
    ###Generate a string in JSON format:

    _data_js_string = [
        '{"numero_caja":"3","compras":[[{"cantidad":"2","nombre":"Harina","precio_unitario":"1500"},\
				   		           {"cantidad":"5","nombre":"Arroz","precio_unitario":"1000"}],\
                                                          [{"cantidad":"4","nombre":"Frijoles","precio_unitario":"800"}],\
                                                          [{"cantidad":"7","nombre":"Manzana","precio_unitario":"500"},\
				   		           {"cantidad":"2","nombre":"JugoNaranja","precio_unitario":"1800"},\
                                                           {"cantidad":"6","nombre":"Carbon","precio_unitario":"1500"},\
				   		           {"cantidad":"3","nombre":"Pera","precio_unitario":"400"}]]}\
 				                          '
    ]

    ##Define the schema to match the one developed in the assignment

    schema = StructType([
        StructField(
            "compras",
            ArrayType(
                ArrayType(
                    StructType([
                        StructField("cantidad", StringType()),
                        StructField("nombre", StringType()),
                        StructField("precio_unitario", StringType())
                    ])))),
        StructField("numero_caja", StringType())
    ])
    ##Convert the JSON string into a DataFrame

    _data_js = spark_session.read.schema(schema).json(
        sc.parallelize(_data_js_string))

    ##The method under test takes the DataFrame obtained after parsing the string as JSON
    ##exploding_data_frame performs two explodes so each product in the nested array gets its own row (a hedged sketch of such a method follows this test)

    _dato_calculado = exploding_data_frame(_data_js)

    ##Build the expected DataFrame from the data passed to the method:

    _dato_esperado = [("3", ["2", "Harina", "1500"]),
                      ("3", ["5", "Arroz", "1000"]),
                      ("3", ["4", "Frijoles", "800"]),
                      ("3", ["7", "Manzana", "500"]),
                      ("3", ["2", "JugoNaranja", "1800"]),
                      ("3", ["6", "Carbon", "1500"]),
                      ("3", ["3", "Pera", "400"])]

    schema = StructType([
        StructField("numero_caja", StringType()),
        StructField(
            "col",
            StructType([
                StructField("cantidad", StringType()),
                StructField("nombre", StringType()),
                StructField("precio_unitario", StringType())
            ]))
    ])

    ##Create the expected DataFrame
    _dato_esperado_df = spark_session.createDataFrame(_dato_esperado, schema)

    _dato_esperado_df.show()
    _dato_calculado.show()
    assert _dato_esperado_df.collect() == _dato_calculado.collect()
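
# A hedged sketch of the exploding_data_frame method under test (the original
# implementation is not shown in this snippet); it assumes the nested "compras"
# array-of-arrays schema above and performs the two explodes described.
from pyspark.sql.functions import explode

def exploding_data_frame(df):
    # The first explode flattens the outer purchase lists; the second yields one
    # row per product struct (default alias "col", matching the expected schema).
    return (df.select("numero_caja", explode("compras").alias("compras_inner"))
              .select("numero_caja", explode("compras_inner")))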
Example #25
spark = SparkSession.builder.getOrCreate()
# Omit all logs except errors
spark.sparkContext.setLogLevel('ERROR')
# Read each file in cricket folder as a separate record
rdd = spark.sparkContext.wholeTextFiles('/user/root/Final/cricket/')
# Suppress hortonworks path prefix from filename and create
# data frame with 2 columns ('doc' and 'text')
data = rdd.map(lambda x: (x[0].replace('hdfs://sandbox-hdp.hortonworks.com:8020', ''), x[1])).toDF(['doc', 'text'])
# Get total document count
total_docs = data.count()

# utility method for tokenizing a piece of text
def tokenize(text):
    return re.findall('\\w+', text.lower())
# Register the tokenize method as a udf
tokenize_udf = F.udf(tokenize, ArrayType(StringType()))
# tokenize all the text
data = data.select(['doc', tokenize_udf('text').alias('text')])
# make 1 separate row for each token
data_tokens = data.withColumn("token", F.explode('text'))

# calculate term frequency
tf = data_tokens.groupBy('doc', 'token').agg(F.count('text').alias('tf'))
# calculate document frequency
df = data_tokens.groupBy('token').agg(F.countDistinct('doc').alias('df'))

# utility method for calculating inverse document frequency
def inverse_doc_frequency(doc_frequency):
    return math.log((total_docs + 1) * 1.0 / (doc_frequency + 1))

# register inverse document frequency as a udf
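# The snippet is truncated here; a hedged sketch of how the IDF registration and
# the TF-IDF join might continue (column names follow the tf/df frames above):
from pyspark.sql.types import DoubleType

idf_udf = F.udf(inverse_doc_frequency, DoubleType())
tf_idf = tf.join(df, on='token').withColumn('tf_idf',
                                            F.col('tf') * idf_udf(F.col('df')))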
Example #26
    predictions = model.transform(test)
    evaluator = RegressionEvaluator(metricName="rmse",
                                    labelCol="rating",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    print("Root-mean-square error = " + str(rmse))

    # Generate top 10 movie recommendations for each user
    userRecs = model.recommendForAllUsers(10)

    # Cast Float to Double (Float is not supported by the Mongo connector)
    userRecs = userRecs.withColumn(
        'recommendations', userRecs['recommendations'].cast(
            ArrayType(
                StructType([
                    StructField('movie_id', IntegerType()),
                    StructField('rating', DoubleType())
                ]))))

    # Write recommendations to the DB
    userRecs.write.format("com.mongodb.spark.sql.DefaultSource").options(
        uri=uri, collection="user_recommendations").mode("overwrite").save()

    # Generate top 10 user recommendations for each movie
    movieRecs = model.recommendForAllItems(10)

    # Generate top 10 movie recommendations for a specified set of users
    users = ratings.select(als.getUserCol()).distinct().limit(3)
    userSubsetRecs = model.recommendForUserSubset(users, 10)
    # Generate top 10 user recommendations for a specified set of movies
    movies = ratings.select(als.getItemCol()).distinct().limit(3)
def build_passage_to_entity_maps(content_path,
                                 spark,
                                 max_rank,
                                 dir_path,
                                 dataset_metadata=dataset_metadata):
    """" """
    df = spark.read.parquet(content_path)
    df.printSchema()

    @udf(returnType=ArrayType(StringType()))
    def get_ents(content_bytearray):
        synthetic_entity_links = document_pb2.DocumentContent().FromString(
            pickle.loads(content_bytearray)).synthetic_entity_links
        entity_links = []
        for synthetic_entity_link in synthetic_entity_links:
            entity_links.append(str(synthetic_entity_link.entity_id))
        return entity_links

    df_entity = df.withColumn("entities", get_ents("content_bytearray"))
    df_entity.printSchema()

    for dataset in ['dev', 'train', 'test']:
        dateset_dir = dir_path + '{}_data/'.format(dataset)
        passage_name = 'passage' + '_{}'.format(dataset)
        passage_path = dataset_metadata[passage_name][0]
        print('================================')
        print('Building passage->entity mappings for {}: {}'.format(
            dataset, passage_path))
        run_dict = {}
        doc_ids_list = []
        with open(passage_path, 'r') as f:
            for line in f:

                query = line.split()[0]
                doc_id = line.split()[2]
                rank = int(line.split()[3])

                if rank <= max_rank:

                    if query not in run_dict:
                        run_dict[query] = []
                    run_dict[query].append(doc_id)
                    doc_ids_list.append(doc_id)

        query_list = sorted(list(run_dict.keys()))

        doc_ids_list = list(set(doc_ids_list))
        print("doc_ids_list len = {}".format(len(doc_ids_list)))
        dataset_df = df_entity[df_entity['content_id'].isin(
            doc_ids_list)].select("content_id", "entities")
        print("dataset_map len = {}".format(dataset_df.count()))
        print(dataset_df.head())

        dataset_dict = {}
        for row in dataset_df.collect():
            dataset_dict[row[0]] = row[1]

        print("dataset_dict len = {}".format(len(dataset_dict)))

        write_json_path = dateset_dir + 'passage_to_entity.json'
        print('writing to: {}'.format(write_json_path))
        with open(write_json_path, 'w') as f:
            json.dump(dataset_dict, f, indent=4)
input_path = sys.argv[1]
output_path = sys.argv[2]
df = spark.read.csv(input_path, header=True, inferSchema=True)
names = df.columns

import pandas as pd
from pyspark.sql.functions import col, pandas_udf, size
from pyspark.sql.types import DoubleType, ArrayType

def predict(*series) -> pd.Series:
    import pandas as pd
    import numpy as np
    from numpy import nan
    from scipy.special._ufuncs import expit
    from scoring_h2oai_experiment_336ccd12_cbb4_11ea_8496_ac1f6b68b7be import Scorer # update with your key
    scorer = Scorer()
    merged = pd.concat(series, axis=1)
    merged.columns = names
    output = scorer.score_batch(merged)
    return pd.Series(output.values.tolist())

    
predict_udf = pandas_udf(predict, returnType=ArrayType(DoubleType()))
columns = [col(name) for name in df.columns]
withPredictions = df.withColumn("prediction", predict_udf(*columns))

# If working with multi-class, can expand prediction, e.g. 3 classes:
num_cols = withPredictions.withColumn("size", size(col("prediction"))).agg({"size": "max"}).head()[0] # To be performant, specify the value, e.g. num_cols=3
withPredictions = withPredictions.select(col("*"), *(col('prediction').getItem(i).alias(f'prediction_{i}') for i in range(num_cols)))
withPredictions = withPredictions.drop(col("prediction"))
Example #29
def get_schema(schema_name):
    schema = None
    if schema_name == 'interim_parkingbay_schema':
        schema = StructType([
            StructField('bay_id', IntegerType(), False),
            StructField('last_edit', StringType()),
            StructField('marker_id', StringType()),
            StructField('meter_id', StringType()),
            StructField('rd_seg_id', StringType()),
            StructField('rd_seg_dsc', StringType()),
            StructField(
                'the_geom',
                StructType([
                    StructField(
                        'coordinates',
                        ArrayType(ArrayType(ArrayType(ArrayType(
                            DoubleType()))))),
                    StructField('type', StringType())
                ])),
            StructField('load_id', StringType()),
            StructField('loaded_on', TimestampType())
        ])
    elif schema_name == 'interim_sensor':
        schema = StructType([
            StructField('bay_id', IntegerType(), False),
            StructField('st_marker_id', StringType()),
            StructField('lat', FloatType()),
            StructField('lon', FloatType()),
            StructField(
                'location',
                StructType([
                    StructField('coordinates', ArrayType(DoubleType())),
                    StructField('type', StringType())
                ]), False),
            StructField('status', StringType()),
            StructField('load_id', StringType()),
            StructField('loaded_on', TimestampType())
        ])
    elif schema_name == 'dw_dim_parking_bay':
        schema = StructType([
            StructField('dim_parking_bay_id', StringType(), False),
            StructField('bay_id', IntegerType(), False),
            StructField('marker_id', StringType()),
            StructField('meter_id', StringType()),
            StructField('rd_seg_id', StringType()),
            StructField('rd_seg_dsc', StringType()),
            StructField(
                'the_geom',
                StructType([
                    StructField(
                        'coordinates',
                        ArrayType(ArrayType(ArrayType(ArrayType(
                            DoubleType()))))),
                    StructField('type', StringType())
                ])),
            StructField('load_id', StringType()),
            StructField('loaded_on', TimestampType())
        ])
    elif schema_name == 'dw_dim_location':
        schema = StructType([
            StructField('dim_location_id', StringType(), False),
            StructField(
                'location',
                StructType([
                    StructField('coordinates', ArrayType(DoubleType())),
                    StructField('type', StringType())
                ]), False),
            StructField('lat', FloatType()),
            StructField('lon', FloatType()),
            StructField('load_id', StringType()),
            StructField('loaded_on', TimestampType())
        ])
    elif schema_name == 'dw_dim_st_marker':
        schema = StructType([
            StructField('dim_st_marker_id', StringType(), False),
            StructField('st_marker_id', StringType()),
            StructField('load_id', StringType()),
            StructField('loaded_on', TimestampType())
        ])
    return schema
Example #30
from pyspark.sql import Window
import pyspark.sql.functions as F
from pyspark.ml.feature import CountVectorizer
from pyspark.sql.types import FloatType
from pyspark.sql.types import ArrayType, StructType, StructField, IntegerType, StringType, BooleanType
import numpy as np
from operator import add
from functools import reduce


@F.udf(
    ArrayType(
        StructType([
            # Adjust types to reflect data types
            StructField("item0", StringType()),
            StructField("item1", IntegerType()),
            StructField("item2", FloatType())
        ])))
def ImpPrice(imp, price):
    imp_rank = list(range(len(imp)))
    price = np.array(price).astype(float).tolist()
    # Return a materialized list; a lazy zip object cannot be serialized by the UDF.
    return list(zip(imp, imp_rank, price))


def getPriceImpressionRank():
    funcs = []
    for col in ["price", 'imp_rank']:
        for func in [F.min, F.max, F.mean, F.stddev]:
            funcs.append(func(col).alias(col + "_" + func.__name__))
    funcs.append(F.count("price").alias('impression_freqs'))