Example #1
# df3.show()
# df4 = df2.withColumn("new", my_udf2(df2.values))
# df4.show()
#
# print(temp1)

from collections import Counter

from pyspark.sql import Row
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType, StructType, StructField

# Question 2
data = [Row(101, [[1, 2], [3]]),
        Row(102, [[1], [2], [1, 2]]),
        Row(103, [[1], [1], [1]])]

my_schema = StructType([
    StructField("id", IntegerType()),
    StructField("value", ArrayType(ArrayType(IntegerType())))
])

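# Count how many distinct values appear more than once across the nested lists in col2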
def my_def1(col2):
    # x = dict(Counter(col2))
    # y = [k for k,v in x.items() if v > 1 ]
    print(col2)
    print(type(col2))
    temp = [item for items in col2 for item in items]
    x = dict(Counter(temp))
    print(x)
    y = [k for k, v in x.items() if v > 1]
    print(y)
    return len(y)

my_udf1 = udf(lambda x: my_def1(x), IntegerType())
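
# A sketch of how the pieces above fit together (not part of the original snippet):
# df = spark.createDataFrame(data, my_schema)
# df.withColumn("dup_count", my_udf1(df["value"])).show()
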
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, to_json, col, unbase64, base64, split, expr
from pyspark.sql.types import StructField, StructType, StringType, FloatType, BooleanType, ArrayType, DateType

# TO-DO: create a StructType for the Kafka redis-server topic which has all changes made to Redis - before Spark 3.0.0, schema inference is not automatic
redisMessageSchema = StructType(
    [
        StructField("key", StringType()),
        StructField("existType", StringType()),
        StructField("Ch", BooleanType()),
        StructField("Incr", BooleanType()),
        StructField("zSetEntries", ArrayType(
            StructType([
                StructField("element", StringType()),
                StructField("Score", StringType())
            ])
        ))
    ]
)
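
# A sketch of how this schema would typically be applied to the Kafka value column
# (DataFrame and column names here are assumptions, not from the source):
# redisDF = kafkaRawDF.selectExpr("cast(value as string) as value") \
#     .withColumn("value", from_json("value", redisMessageSchema))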

# TO-DO: create a StructType for the Customer JSON that comes from Redis - before Spark 3.0.0, schema inference is not automatic
redisCustomerSchema = StructType(
    [
        StructField("customerName", StringType()),
        StructField("email", StringType()),
        StructField("phone", StringType()),
        StructField("birthDay", StringType()),
    ]
)

# TO-DO: create a StructType for the Kafka stedi-events topic which has the Customer Risk JSON that comes from Redis - before Spark 3.0.0, schema inference is not automatic
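# A minimal sketch for the TO-DO above; the field names (a customer identifier, a risk score and
# an event date) are assumptions, not taken from the source:
customerRiskSchema = StructType(
    [
        StructField("customer", StringType()),
        StructField("score", StringType()),
        StructField("riskDate", StringType())
    ]
)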

# Imports needed by the text-processing snippet below
from nltk.stem import SnowballStemmer
from pyspark.ml.feature import Tokenizer, Word2Vec
from pyspark.sql import functions as F
from pyspark.sql.window import Window

# Read input from 5 text sources using a Spark DataFrame
documents = spark.read.text("dataset/*.txt")
documents = documents.withColumn("doc_id",
                                 F.row_number().over(Window.orderBy('value')))
documents.printSchema()

# Tokens identified and extracted from the input data
tokenizer = Tokenizer(inputCol="value", outputCol="words")
wordsData = tokenizer.transform(documents)
wordsData.show()

# Stem the tokens using the Snowball stemmer
stemmer = SnowballStemmer(language='english')
stemmer_udf = F.udf(lambda tokens: [stemmer.stem(token) for token in tokens],
                    ArrayType(StringType()))
wordsData = wordsData.withColumn("lemms", stemmer_udf("words"))

# Learn a mapping from words to Vectors.
word2Vec = Word2Vec(vectorSize=3,
                    minCount=0,
                    inputCol="lemms",
                    outputCol="result")
model = word2Vec.fit(wordsData)
result = model.transform(wordsData)

# Display synonyms and cosine similarity of words within input data
synonyms = model.findSynonyms(
    "5g", 5)  # its okay for certain words , real bad for others
synonyms.show(5)
Example #4

#For databricks related packages
#./bin/pyspark --packages com.databricks:spark-csv_2.10:1.3.0

#Before Spark 1.4
train = sqlContext.load(source="com.databricks.spark.csv", path = 'PATH/train.csv', header = True,inferSchema = True)
test = sqlContext.load(source="com.databricks.spark.csv", path = 'PATH/test-comb.csv', header = True,inferSchema = True)

#Current Spark 2.1 and ...
from pyspark.sql import SparkSession
spark = SparkSession.builder.master("yarn").getOrCreate()
df = spark.read.csv('hdfs://hadoop-master:9000/index/train.csv',mode="DROPMALFORMED")
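
# For comparison, the same reads with the Spark 2.x DataFrameReader API (header/inferSchema
# mirror the pre-1.4 calls above; the paths are the same placeholders):
train = spark.read.csv('PATH/train.csv', header=True, inferSchema=True, mode="DROPMALFORMED")
test = spark.read.csv('PATH/test-comb.csv', header=True, inferSchema=True, mode="DROPMALFORMED")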

#Defining schema with ArrayType
schema = StructType([StructField('array_column', ArrayType(StructType([StructField('element_of_array', StringType(), True)])), True)])

# From local: the third parameter (Boolean, True/False) denotes whether the corresponding field can be nullable
from pyspark.sql.types import StructType,StructField,LongType,StringType,TimestampType
schema = StructType([StructField('col0', LongType(), True),
                     StructField('col1', LongType(), True),
                     StructField('col2', StringType(), True),
                     StructField('col3', StringType(), True),
                     StructField('col4', TimestampType(), True),
                     StructField('col5', TimestampType(), True),
                     StructField('col6', StringType(), True)])
df = spark.read.csv('file:///index/data_extract_restart2_without_cert/data_refined.csv', mode="DROPMALFORMED", schema=schema)

#Creating UDF
def replace_commas(sk):
    # replace commas with pipes in column col2; the result goes into column new_column_name
    return sk.replace(',', '|')


udf_dict = udf(replace_commas, StringType())

df.withColumn('new_column_name', udf_dict("col2")).write.csv(path="/index/skill_clean_v3")#col2 is the column to be changed
Example #5
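# Map a protobuf FieldDescriptor to a Spark SQL type; repeated fields are wrapped in ArrayType.
# (`possible_types`, defined elsewhere in the original module, maps protobuf type constants to
# functions that build the corresponding Spark type.)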
def __type_for(field_descriptor):
    get_type = possible_types.get(field_descriptor.type, lambda t: StringType())
    if field_descriptor.label == field_descriptor.LABEL_REPEATED:
        return ArrayType(get_type(field_descriptor))
    return get_type(field_descriptor)
Example #6
    # remove stop words
    remover = StopWordsRemover(inputCol=column, outputCol='remove_stop_words')
    return remover.transform(df).drop(column).withColumnRenamed(
        'remove_stop_words', column)


def remove_not_alphabetic(tokens):
    # isalpha remove words with numbers
    # return [w for w in tokens if w.isalpha()]
    # isalnum allow words and numbers
    return [w for w in tokens if w.isalnum()]


# udf_remove_not_alphabetic = udf(remove_not_alphabetic, ArrayType(StringType()))
udf_remove_not_alphabetic = udf(
    lambda tokens: [w for w in tokens if w.isalnum()], ArrayType(StringType()))


def remove_only_numbers(tokens):
    return [w for w in tokens if not w.isnumeric()]


# udf_remove_only_numbers = udf(remove_only_numbers, ArrayType(StringType()))
udf_remove_only_numbers = udf(
    lambda tokens: [w for w in tokens if not w.isnumeric()],
    ArrayType(StringType()))


def remove_only_spaces(tokens):
    return [w for w in tokens if not w.isspace()]
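

# Hypothetical usage of the UDFs above (the column name 'words' is an assumption):
# df = df.withColumn('words', udf_remove_not_alphabetic('words'))
# df = df.withColumn('words', udf_remove_only_numbers('words'))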
Example #7
    def to_array(col):
        def to_array_(v):
            return v.toArray().tolist()

        return udf(to_array_, ArrayType(DoubleType()))(col)
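
    # Hypothetical usage (not in the original): flatten an MLlib Vector column into a plain array
    # df = df.withColumn("features_arr", to_array(df["features"]))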
Example #8

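# Note: `period` and `PERIOD_WEEKLY` are module-level settings defined elsewhere in the original script.
# Returns the yyyymmdd integers covered by the period starting at start_date (a full week, or a single day).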
def get_date_list(start_date):
    if period == PERIOD_WEEKLY:
        end_date = start_date + timedelta(6 - start_date.weekday())
    else:
        end_date = start_date + timedelta(1)
    date_id_list = []
    date_focus = start_date
    while (date_focus < end_date):
        date_id_list.append(int(date_focus.strftime("%Y%m%d")))
        date_focus += timedelta(1)
    return date_id_list


udf_get_date_list = f.udf(get_date_list, ArrayType(LongType()))


def get_df_boom_point_weekly(start_date, df_boom_point):
    end_date = start_date + timedelta(7)
    start_date_id = int(start_date.strftime("%Y%m%d"))
    end_date_id = int(end_date.strftime("%Y%m%d"))

    print('start_date_id: ' + str(start_date_id))
    print('end_date_id: ' + str(end_date_id))

    date_id_list = []
    date_focus = start_date
    while (date_focus < end_date):
        date_id_list.append(int(date_focus.strftime("%Y%m%d")))
        date_focus += timedelta(1)
Example #9
def derive_directories_with_spark(input_dir: str,
                                  output_dir: str,
                                  workflows: dict,
                                  mode='batch',
                                  print_output=False,
                                  spark=None):

    if not spark:
        spark = SparkSession \
            .builder \
            .appName("derive_directories") \
            .getOrCreate()

    if mode == 'stream':
        # infer schema from existing file
        tiny_loc = os.path.dirname(os.path.realpath(
            __file__)) + "/../../../data/examples/panoptes_raw.txt"
        assert os.path.exists(tiny_loc)
        schema = spark.read.json(tiny_loc).schema
        logging.warning(
            'Attempting to stream derived files to {}'.format(input_dir))
        df = spark.readStream.json(input_dir, schema=schema)
    else:
        df = spark.read.json(input_dir)

    df = df.filter(df['links']['workflow'].isin(list(
        workflows.keys())))  # include only allowed workflows

    df = clarify_workflow_version(df)
    df = rename_metadata_like_exports(df)

    workflows_str = json.dumps(workflows)

    def match_and_insert_workflow(annotations, workflow_id, major_version,
                                  minor_version, workflows_str):
        workflow_versions = pd.DataFrame(
            data=json.loads(workflows_str)[workflow_id])
        workflow = find_matching_version(major_version, minor_version,
                                         workflow_versions)
        annotations_dict = json.loads(annotations)
        updated_annotations_dict = insert_workflow_contents(
            annotations_dict, workflow)
        return json.dumps(updated_annotations_dict)

    match_and_insert_workflow_udf = udf(
        lambda a, b, c, d: match_and_insert_workflow(a, b, c, d, workflows_str
                                                     ), StringType())

    # apparently can't pass struct as udf argument, need to use as string
    df = df.withColumn('annotations', to_json(df['annotations']))
    df = df.withColumn(
        'annotations',
        match_and_insert_workflow_udf(
            df['annotations'],
            df['workflow_id'],  # added by rename_metadata_like_exports
            df['workflow_major_version'],  # added by clarify_workflow_version
            df['workflow_minor_version']  # added by clarify_workflow_version
        ))

    # parse annotations back out again (using the new schema, of course)
    annotations_schema = ArrayType(
        StructType([
            StructField("task", StringType(), False),
            StructField("value", StringType(), False),
            StructField("task_label", StringType(), False),
            StructField("task_id", StringType(), False),
            StructField("value_index", StringType(), False),
            StructField("multiple_choice", BooleanType(), False)
        ]))
    df = df.withColumn('annotations',
                       from_json(df['annotations'], schema=annotations_schema))

    if mode == 'stream':
        query = df.writeStream \
            .outputMode('append') \
            .option('checkpointLocation', os.path.join(output_dir, 'checkpoints')) \
            .trigger(processingTime='3 seconds') \
            .start(path=output_dir, format='json')
        print('Derived data ready to stream')
        if print_output:
            while True:
                time.sleep(0.1)
                if query.status['isDataAvailable']:
                    print(datetime.now(), query.status['message'])
    else:
        df.show()
        df.write.save(output_dir, format='json', mode='overwrite')
Example #10
def task4_generate_grams_udf(context):
    # TASK 4
    # Code for task 4...
    context.udf.register("sanitize", sanitize, ArrayType(StringType()))
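    # Once registered, the UDF is callable from SQL, e.g. (hypothetical table/column names):
    # context.sql("SELECT id, sanitize(body) AS ngrams FROM comments")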
Example #11
df.printSchema()


def extract_cat(text):
    all_groups = re.findall(r'(\[\[Category:)(\w+)(\]\])', text)
    words = [group[1] for group in all_groups]
    return words


def extract_links(text):
    all_groups = re.findall(r'(\[\[)(\w+)(\]\])', text)
    words = [group[1] for group in all_groups]
    return words


extract_cat_udf = udf(extract_cat, ArrayType(StringType()))
extract_links_udf = udf(extract_links, ArrayType(StringType()))

## Select a sample to analyze
df = df.limit(1000)

df = df.withColumn('categories',extract_cat_udf('revision.text'))\
    .withColumn('page_links',extract_links_udf('revision.text'))\
    .withColumn('last_modify_date',to_timestamp('revision.timestamp',TIME_FORMAT))

df_pages = df.select('id', 'title', 'last_modify_date', 'categories',
                     'page_links')
df_pages.show()

df_categories = df.selectExpr('id', 'title', 'explode(categories) as category',
                              'last_modify_date')
Example #12
def task5_combine_trigrams_subreddits(labeled_comments):
    # TASK 5
    # Code for task 5...
    sanitize_udf = udf(sanitize, ArrayType(StringType()))
    return labeled_comments.select('*', sanitize_udf('body').alias('ngrams'))
    def test_supported_types(self):

        values = [
            1, 2, 3, 4, 5, 1.1, 2.2,
            Decimal(1.123), [1, 2, 2], True, 'hello',
            bytearray([0x01, 0x02])
        ]
        output_fields = [('id', IntegerType()), ('byte', ByteType()),
                         ('short', ShortType()), ('int', IntegerType()),
                         ('long', LongType()), ('float', FloatType()),
                         ('double', DoubleType()),
                         ('decim', DecimalType(10, 3)),
                         ('array', ArrayType(IntegerType())),
                         ('bool', BooleanType()), ('str', StringType()),
                         ('bin', BinaryType())]

        output_schema = StructType([StructField(*x) for x in output_fields])
        df = self.spark.createDataFrame([values], schema=output_schema)

        # Different forms of group map pandas UDF, results of these are the same
        udf1 = pandas_udf(
            lambda pdf: pdf.assign(byte=pdf.byte * 2,
                                   short=pdf.short * 2,
                                   int=pdf.int * 2,
                                   long=pdf.long * 2,
                                   float=pdf.float * 2,
                                   double=pdf.double * 2,
                                   decim=pdf.decim * 2,
                                   bool=False if pdf.bool else True,
                                   str=pdf.str + 'there',
                                   array=pdf.array,
                                   bin=pdf.bin), output_schema,
            PandasUDFType.GROUPED_MAP)

        udf2 = pandas_udf(
            lambda _, pdf: pdf.assign(byte=pdf.byte * 2,
                                      short=pdf.short * 2,
                                      int=pdf.int * 2,
                                      long=pdf.long * 2,
                                      float=pdf.float * 2,
                                      double=pdf.double * 2,
                                      decim=pdf.decim * 2,
                                      bool=False if pdf.bool else True,
                                      str=pdf.str + 'there',
                                      array=pdf.array,
                                      bin=pdf.bin), output_schema,
            PandasUDFType.GROUPED_MAP)

        udf3 = pandas_udf(
            lambda key, pdf: pdf.assign(id=key[0],
                                        byte=pdf.byte * 2,
                                        short=pdf.short * 2,
                                        int=pdf.int * 2,
                                        long=pdf.long * 2,
                                        float=pdf.float * 2,
                                        double=pdf.double * 2,
                                        decim=pdf.decim * 2,
                                        bool=False if pdf.bool else True,
                                        str=pdf.str + 'there',
                                        array=pdf.array,
                                        bin=pdf.bin), output_schema,
            PandasUDFType.GROUPED_MAP)

        result1 = df.groupby('id').apply(udf1).sort('id').toPandas()
        expected1 = df.toPandas().groupby('id').apply(
            udf1.func).reset_index(drop=True)

        result2 = df.groupby('id').apply(udf2).sort('id').toPandas()
        expected2 = expected1

        result3 = df.groupby('id').apply(udf3).sort('id').toPandas()
        expected3 = expected1

        assert_frame_equal(expected1, result1)
        assert_frame_equal(expected2, result2)
        assert_frame_equal(expected3, result3)
interested_frame = sql_context.sql(
    "SELECT sa2_name16 AS suburb_name , sa2_main16 AS suburb_code, geometry.coordinates AS coordinates FROM raw"
)

# group and aggregate table by suburb
interested_frame.registerTempTable('interested_table')
gr = sql_context.sql(
    "SELECT suburb_name, FIRST(suburb_code) AS suburb_code, collect_list(coordinates) AS multipolygon "
    "FROM interested_table GROUP BY suburb_name")

# calculate suburb area
user_define_function = func.udf(calculate_multipolygon_area, FloatType())
df1 = gr.withColumn('suburb_area', user_define_function('multipolygon'))

multipolygons_bounds_udf = func.udf(calculate_multipolygon_bounds,
                                    ArrayType(ArrayType(FloatType())))

# Dataframe of melbourne sa2 suburb
df1 = df1.withColumn('suburb_bound', multipolygons_bounds_udf('multipolygon'))

# load forest data
schema = StructType([
    StructField('area', FloatType(), nullable=False),
    StructField('polygon', StringType(), nullable=False)
])
df2 = sql_context.read.format("com.databricks.spark.csv").option("header", "false").option('delimiter', ' '). \
    load(os.path.join(root, "melb_urban_forest_2016.txt/part-*"), schema=schema)

# transform geometry string to formatted string
df2 = df2.withColumn(
    'polygon_formatted',
        classifiers_dict.get("ORGANIZATION", {}))
    ordered_dict["person_ss"] = list(classifiers_dict.get("PERSON", {}))
    ordered_dict["id"] = row["id"]
    return ordered_dict


read_opts = {
    "collection": "cnn",
    "fields": "id, body_t",
    "max_rows": "10",
    "query": "body_t:[* TO *]"
}
df = spark.read.format("solr").options(**read_opts).load()

cnn_rdd = df.rdd.map(lambda r: Row(**classify_text(r.asDict())))

schema = StructType([
    StructField("location_ss", ArrayType(StringType())),
    StructField("organization_ss", ArrayType(StringType())),
    StructField("person_ss", ArrayType(StringType())),
    StructField("id", StringType())
])

newdf = spark.createDataFrame(cnn_rdd, schema)
# newdf.cache

spark.sparkContext._jvm.com.lucidworks.spark.util.DatasetLoader.sendAtomicAddsToSolr(
    newdf._jdf, "cnn", "id", "localhost:9983/lwfusion/4.0.0-SNAPSHOT/solr")

spark.stop()
Example #16
query = "select count_array,day,uckey from factdata where day in ('2020-05-15','2020-05-14','2020-05-13','2020-05-12','2020-05-11','2020-05-10','2020-05-09')"
sc = SparkContext()
hive_context = HiveContext(sc)

df = hive_context.sql(query)
df = add_count_map(df)

df = df.select('uckey', 'day',
               explode(df.count_map)).withColumnRenamed("value", "impr_count")

df = df.withColumn('impr_count',
                   udf(lambda x: int(x), IntegerType())(df.impr_count))
df = df.groupBy('uckey', 'day').sum('impr_count').withColumnRenamed(
    "sum(impr_count)", 'impr_count')

split_uckey_udf = udf(lambda x: x.split(","), ArrayType(StringType()))
df = df.withColumn('col', split_uckey_udf(df.uckey))
df = df.select('uckey', 'impr_count', 'day',
               df.col[1]).withColumnRenamed("col[1]", 'slot_id')

df_slot = df.select('slot_id', 'impr_count', 'day')
df_slot = df_slot.groupBy('slot_id',
                          'day').sum('impr_count').withColumnRenamed(
                              "sum(impr_count)", "impr_total")
bc_df_slot = broadcast(df_slot)

df_new = df.join(bc_df_slot, on=["slot_id", 'day'], how="inner")

df_new = df_new.withColumn(
    'percent',
    udf(lambda x, y: (x * 100) / y, FloatType())(df_new.impr_count,
Example #17
            'py': self.py,
            'pz': self.pz,
            'ke': to_electron_volt(self.ke)
        }


Model = NewType('Model', Callable[[Hit], Optional[AnalyzedHit]])
Analyzer = NewType('Analyzer', Callable[[Hit], Mapping[str,
                                                       Optional[AnalyzedHit]]])

if pyspark_exists:
    from pyspark.sql.types import StructType, StructField, DoubleType, IntegerType, ArrayType, MapType, StringType

    SpkAnalyzedHit = StructType([
        StructField('px', DoubleType(), nullable=False),
        StructField('py', DoubleType(), nullable=False),
        StructField('pz', DoubleType(), nullable=False),
        StructField('ke', DoubleType(), nullable=False),
    ])
    SpkHit = StructType([
        StructField('t', DoubleType(), nullable=False),
        StructField('x', DoubleType(), nullable=False),
        StructField('y', DoubleType(), nullable=False),
        StructField('flag', IntegerType(), nullable=True),
        StructField('as', MapType(StringType(), SpkAnalyzedHit),
                    nullable=True),
    ])
    SpkHits = ArrayType(SpkHit)
else:
    SpkAnalyzedHit, SpkHit, SpkHits = None, None, None
def convert_extract_to_parquet(extract_loc, save_dir, spark=None):

    if not spark:
        spark = SparkSession \
        .builder \
        .appName("shared") \
        .getOrCreate()

    # https://spark.apache.org/docs/latest/api/python/pyspark.sql.html?highlight=read%20csv
    # ignore (escape) " if already within quotes to avoid splitting by , within the columnwise jsons

    # NullPointerException if you try to access something you promised would never be null

    # cannot read nested structs straight from CSV, sadly, so will parse columns individually
    metadata_struct = StructType([
        StructField('source', StringType(), False),
        StructField('session', StringType(), False),
        StructField(
            'viewport',
            StructType([
                StructField('width', StringType(), False),
                StructField('height', StringType(), False)
            ]), False),
        StructField('started_at', TimestampType(), False),
        StructField('user_agent', StringType(), False),
        StructField('utc_offset', StringType(), False),
        StructField('finished_at', TimestampType(), False),
        StructField('live_project', BooleanType(), False),
        StructField('interventions', StringType(), False),  # actually struct
        StructField('user_language', StringType(), False),
        StructField('source', StringType(), False),
        StructField('subject_dimensions', StringType(),
                    False),  # actually struct
        StructField('subject_selection_state', StringType(),
                    False),  # actually struct
        StructField('workflow_translation_id', StringType(),
                    True),  # actually struct, sometimes null
    ])

    # TODO 'answer' (at the very least) is very occasionally null, which causes either an EOF/NullPointer
    # error (if the field is not nullable) or an error like:
    #   ValueError: Answer None of type <class 'NoneType'> not found in schema for question T0
    # We should filter out tasks with missing keys for these.
    annotations_struct = ArrayType(
        StructType([
            StructField('task', StringType(), True),
            StructField('task_id', StringType(), True),
            StructField('task_label', StringType(), True),
            StructField('value', StringType(), True),
            StructField('multiple_choice', BooleanType(), True),
        ]))
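
    # Sketch for the TODO above (not part of the original code): drop annotation entries whose
    # answer is missing before they are parsed, e.g. inside annotation_to_struct.
    def drop_unanswered(annotation_list):
        return [task for task in annotation_list if task.get('value') is not None]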

    # subject_data_internal_struct = StructType(
    #     # StructField('!iauname', StringType(), True),
    #     # StructField('iauname', StringType(), True)
    # )
    # subject_data_struct = ArrayType(MapType(StringType(), subject_data_internal_struct))

    schema = StructType([
        StructField('classification_id', StringType(), False),
        StructField('user_name', StringType(), True),
        StructField('user_id', StringType(), True),
        StructField('user_ip', StringType(), True),
        StructField('workflow_id', StringType(), False),
        StructField('workflow_name', StringType(), False),
        StructField('workflow_version', FloatType(), False),
        StructField('created_at', StringType(), False),
        StructField('gold_standard', StringType(), False),
        StructField('expert', StringType(), False),
        StructField('metadata', StringType(), False),
        StructField('annotations', StringType(), False),
        StructField('subject_data', StringType(), False),
        StructField('subject_ids', StringType(), False)
    ])

    # schema = StructType([
    #     StructField('name', StructType([
    #          StructField('firstname', StringType(), True),
    #          StructField('middlename', StringType(), True),
    #          StructField('lastname', StringType(), True)
    #          ])),
    #      StructField('id', StringType(), True),
    #      StructField('gender', StringType(), True),
    #      StructField('salary', IntegerType(), True)
    #      ])

    ds = spark.read.csv(extract_loc,
                        header=True,
                        quote='"',
                        escape='"',
                        schema=schema,
                        mode='FAILFAST')

    # for debugging
    # ds = ds.sample(withReplacement=False, fraction=.1, seed=42)
    # print(ds.head())

    # need to unpack metadata and subject data
    # print(ds.head()['metadata'])
    # print(ds.head()['annotations'])

    metadata_str_to_struct_udf = udf(metadata_str_to_struct,
                                     returnType=metadata_struct)
    annotations_str_to_struct_udf = udf(annotation_to_struct,
                                        returnType=annotations_struct)
    subject_data_str_to_iauname_udf = udf(subject_data_str_to_iauname,
                                          returnType=StringType())
    get_person_id_udf = udf(get_person_id, returnType=StringType())

    ds = ds.withColumn('metadata', metadata_str_to_struct_udf(ds['metadata']))
    ds = ds.withColumn('annotations',
                       annotations_str_to_struct_udf(ds['annotations']))
    ds = ds.withColumn('iauname',
                       subject_data_str_to_iauname_udf(ds['subject_data']))

    ds = ds.withColumn('person_id',
                       get_person_id_udf(ds['user_id'], ds['user_ip']))

    ds = ds.withColumnRenamed('subject_ids', 'subject_id')
    ds = ds.withColumn(
        'project_id', lit('5733')
    )  # TODO hardcoded for now as not in export. lit to make it a column, as Spark requires.

    flattened = flatten.api_df_to_responses(ds)

    flattened.write.parquet(save_dir, mode='overwrite')
Example #19
  sess.run(tf.compat.v1.global_variables_initializer())
  sess.run(iterator.initializer, feed_dict={image_input: image_batch})
  softmax_tensor = sess.graph.get_tensor_by_name('softmax_tensor:0')
  result = []
  try:
    while True:
      batch = sess.run(image)
      preds = sess.run(softmax_tensor, {'input_tensor:0': batch})
      result = result + list(preds)
  except tf.errors.OutOfRangeError:
    pass

  return pd.Series(result)


df = spark.read.format("tfrecords").schema(schema).load(input_local_dir+'/flowers_train*.tfrecord')
df = df.limit(3200)


# image_batch = df.limit(128).toPandas().loc[: , "image/encoded"].apply(lambda x: bytes(x))
# images = predict_batch(image_batch)
# print(images.shape)
predict_batch_udf = pandas_udf(ArrayType(FloatType()), PandasUDFType.SCALAR)(predict_batch)
predictions = df.select(predict_batch_udf(col("image/encoded")).alias("prediction"))
predictions.write.mode("overwrite").save("/tmp/predictions")
result_df = spark.read.load("/tmp/predictions")
display(result_df)
Example #20
def indices_to_terms(vocabulary):
    def indices_to_terms(xs):
        return [vocabulary[int(x)] for x in xs]

    return udf(indices_to_terms, ArrayType(StringType()))
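
# Hypothetical usage (not in the original): map LDA describeTopics() term indices back to words,
# given a fitted CountVectorizerModel cv_model:
# topics = topics.withColumn("terms", indices_to_terms(cv_model.vocabulary)("termIndices"))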
Example #21
                                        outputMode="vector")
transformer

fs = !ls content/train/*.jpg
uri_df = spark.createDataFrame(fs, StringType()).toDF("filename")
keras_pred_df = transformer.transform(uri_df)

from keras.models import Sequential
from keras.layers import Dense
import numpy as np
from pyspark.sql.types import StructType, StructField, ArrayType, FloatType

num_features = 10
num_examples = 100
input_data = [{"features" : np.random.randn(num_features).astype(float).tolist()} for i in range(num_examples)]
schema = StructType([ StructField("features", ArrayType(FloatType()), True)])
input_df = spark.createDataFrame(input_data, schema)


model = Sequential()
model.add(Dense(units=20, input_shape=[num_features], activation='relu'))
model.add(Dense(units=1, activation='sigmoid'))
model_path = "simple-binary-classification"
model.save(model_path)


transformer = KerasImageFileTransformer(inputCol="features", outputCol="category",
                                        modelFile=model_path,
                                        imageLoader=loadAndPreprocessKerasInceptionV3,
                                        outputMode="vector")
final_df = transformer.transform(input_df)
Example #22
def create_test_scalar_dataset(tmp_url, num_rows, num_files=4, spark=None):
    shutdown = False
    if not spark:
        spark_session = SparkSession \
            .builder \
            .appName('petastorm_end_to_end_test') \
            .master('local[*]')

        spark = spark_session.getOrCreate()
        shutdown = True

    expected_data = [{
        'id':
        np.int32(i),
        'int_fixed_size_list':
        np.arange(1 + i, 10 + i).astype(np.int32),
        'datetime':
        np.datetime64('2019-01-02'),
        'timestamp':
        np.datetime64('2005-02-25T03:30'),
        'string':
        np.unicode_('hello_{}'.format(i)),
        'string2':
        np.unicode_('world_{}'.format(i)),
        'float64':
        np.float64(i) * .66
    } for i in range(num_rows)]

    expected_data_as_scalars = [{
        k: np.asscalar(v) if isinstance(v, np.generic) else v
        for k, v in row.items()
    } for row in expected_data]

    # np.datetime64 is converted to a timezone unaware datetime instances. Working explicitly in UTC so we don't need
    # to think about local timezone in the tests
    for row in expected_data_as_scalars:
        row['timestamp'] = row['timestamp'].replace(tzinfo=pytz.UTC)
        row['int_fixed_size_list'] = row['int_fixed_size_list'].tolist()

    rows = [Row(**row) for row in expected_data_as_scalars]

    # WARNING: surprisingly, schema fields and row fields are matched only by order and not name.
    # We must maintain alphabetical order of the struct fields for the code to work!!!
    schema = StructType([
        StructField('datetime', DateType(), False),
        StructField('float64', DoubleType(), False),
        StructField('id', IntegerType(), False),
        StructField('int_fixed_size_list', ArrayType(IntegerType(), False),
                    False),
        StructField('string', StringType(), False),
        StructField('string2', StringType(), False),
        StructField('timestamp', TimestampType(), False),
    ])

    dataframe = spark.createDataFrame(rows, schema)
    dataframe. \
        coalesce(num_files). \
        write.option('compression', 'none'). \
        mode('overwrite'). \
        parquet(tmp_url)

    if shutdown:
        spark.stop()

    return expected_data
Example #23
    day = cfg['today']
    allocated = {'b1': 100, 'b2': 50}
    result = optimizer.dao.query_builder.index_bb(day, ands, minus, allocated,
                                                  es_client_bb)
    print(result)


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='optimizer')
    parser.add_argument('config_file')
    parser.add_argument('today', help='today in %%Y%%m%%d format, e.g. 20181230')
    args = parser.parse_args()

    schema = StructType([
        StructField('day', StringType(), True),
        StructField('ands', ArrayType(StringType()), True),
        StructField('minus', ArrayType(StringType()), True),
        StructField('allocated', MapType(StringType(), IntegerType()), True),
        StructField('amount', IntegerType(), True)
    ])

    # Load config file
    try:
        with open(args.config_file, 'r') as ymlfile:
            cfg = yaml.load(ymlfile, Loader=yaml.SafeLoader)
            cfg['today'] = args.today
    except Exception as e:
        print(e)

    test_9(cfg)
Example #24
 StructField("locale", StringType(), True),
 StructField(
     "active_addons",
     # active_addons is a list of dictionaries holding all
     # metadata related to an addon
     ArrayType(
         StructType([
             StructField("addon_id", StringType(), True),
             StructField("app_disabled", BooleanType(), True),
             StructField("blocklisted", BooleanType(), True),
             StructField("foreign_install", BooleanType(), True),
             StructField("has_binary_components", BooleanType(), True),
             StructField("install_day", LongType(), True),
             StructField("is_system", BooleanType(), True),
             StructField("is_web_extension", BooleanType(), True),
             StructField("multiprocess_compatible", BooleanType(), True),
             StructField("name", StringType(), True),
             StructField("scope", LongType(), True),
             StructField("signed_state", LongType(), True),
             StructField("type", StringType(), True),
             StructField("update_day", LongType(), True),
             StructField("user_disabled", BooleanType(), True),
             StructField("version", StringType(), True),
         ]),
         True,
     ),
 ),
 StructField("places_bookmarks_count_mean", LongType(), True),
 StructField(
     "scalar_parent_browser_engagement_tab_open_event_count_sum",
     LongType(),
Example #25
 def sqlType(cls):
     return StructType([
         StructField("type", ByteType(), False),
         StructField("size", IntegerType(), True),
         StructField("indices", ArrayType(IntegerType(), False), True),
         StructField("values", ArrayType(DoubleType(), False), True)])
import itertools
from functools import partial
from time import time

import nltk
from pyspark.ml.feature import NGram
from pyspark.sql.types import StructType, StructField, DoubleType, ArrayType, StringType


print "Start preprocessing all data"
t0 = time()

def preProcess(doc):
    clean = doc.review.replace("<br /><br />"," ")
    tok = nltk.tokenize.wordpunct_tokenize(clean)
    tags = nltk.pos_tag(tok,tagset='universal')
    low = [word.lower() for word in tok]
    return low, list(zip(*tags))[1], doc.label

schema = StructType([StructField('words',ArrayType(StringType()),True), StructField('tags',ArrayType(StringType()),True), StructField('label',DoubleType())])

dfPre=df.map(preProcess).toDF(schema).cache()
trigram = NGram(n=3,inputCol="tags", outputCol="tagTrigrams")
dfTriAux = trigram.transform(dfPre).cache()
trigram.setInputCol("words")
trigram.setOutputCol("wordTrigrams")
dfTri = trigram.transform(dfTriAux).cache()

dfTrain, dfValid = dfTri.randomSplit([0.8,0.2])


lists=dfTrain.map(lambda r : r.words).collect()
dictUnigrams=list(set(itertools.chain(*lists)))
dictionaryUni={}
for i,word in enumerate(dictUnigrams):
Example #27
    "double": DoubleType,
    "boolean": BooleanType,
    "struct": StructType,
    "array": ArrayType,
    "bigint": LongType,
    "date": DateType,
    "byte": ByteType,
    "short": ShortType,
    "datetime": TimestampType,
    "binary": BinaryType,
    "null": NullType,
    "vector": VectorUDT
}
SPARK_DTYPES_DICT_OBJECTS = \
    {"string": StringType(), "int": IntegerType(), "float": FloatType(),
     "double": DoubleType(), "boolean": BooleanType(), "struct": StructType(), "array": ArrayType(StringType()),
     "bigint": LongType(), "date": DateType(), "byte": ByteType(), "short": ShortType(),
     "datetime": TimestampType(), "binary": BinaryType(), "null": NullType()
     }
PROFILER_COLUMN_TYPES = {
    "categorical", "numeric", "date", "null", "array", "binary"
}
PYTHON_TO_PROFILER = {
    "string": "categorical",
    "boolean": "categorical",
    "int": "numeric",
    "decimal": "numeric",
    "date": "date",
    "array": "array",
    "binaty": "binary",
    "null": "null"
# MAGIC   <img alt="Opens in new tab" src="https://files.training.databricks.com/static/images/external-link-icon-16x16.png"/>&nbsp;Watch full-screen.</a>
# MAGIC </div>

# COMMAND ----------

# MAGIC %md
# MAGIC The ZIP Code dataset contains an array with the latitude and longitude of the cities.  Use an `ArrayType`, which takes the primitive type of its elements as an argument.

# COMMAND ----------

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, ArrayType, FloatType

zipsSchema3 = StructType([
  StructField("city", StringType(), True), 
  StructField("loc", 
    ArrayType(FloatType(), True), True),
  StructField("pop", IntegerType(), True)
])

# COMMAND ----------

# MAGIC %md
# MAGIC Apply the schema using the `.schema()` method and observe the results.  Expand the array values in the column `loc` to explore further.

# COMMAND ----------

zipsDF3 = (spark.read
  .schema(zipsSchema3)
  .json("/mnt/training/zips.json")
)
display(zipsDF3)
Example #29
def get_hrv_features(rr_data, acceptable_percentage=50, window_length=60):
    """

    Args:
        rr_data (DataStream):
        acceptable_percentage (int):
        window_length (int):

    Returns:

    """
    stream_name = 'org.md2k.autosense.ecg.features'

    def get_metadata():
        stream_metadata = Metadata()
        stream_metadata.set_name(stream_name).set_description("HRV Features from ECG RR interval") \
            .add_input_stream(rr_data.metadata.get_name()) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("var")
                .set_type("double")
                .set_attribute("description","variance")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("iqr")
                .set_type("double")
                .set_attribute("description","Inter Quartile Range")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("mean")
                .set_type("double")
                .set_attribute("description","Mean RR Interval")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("median")
                .set_type("double")
                .set_attribute("description","Median RR Interval")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("80th")
                .set_type("double")
                .set_attribute("description","80th percentile RR Interval")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("20th")
                .set_type("double")
                .set_attribute("description","20th percentile RR Interval")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("heartrate")
                .set_type("double")
                .set_attribute("description","Heart Rate in BPM")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("vlf")
                .set_type("double")
                .set_attribute("description","Very Low Frequency Energy")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("lf")
                .set_type("double")
                .set_attribute("description","Low Frequency Energy")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("hf")
                .set_type("double")
                .set_attribute("description","High Frequency Energy")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("lfhf")
                .set_type("double")
                .set_attribute("description","Low frequency to High Frequency energy ratio")) \
            .add_dataDescriptor(
            DataDescriptor()
                .set_name("window")
                .set_type("struct")
                .set_attribute("description","window start and end time in UTC")
                .set_attribute('start','start of window')
                .set_attribute('end','end of window')) \
            .add_module(
            ModuleMetadata().set_name("HRV Features from ECG RR Interval")
                .set_attribute("url", "http://md2k.org/")
                .set_attribute('algorithm','ecg feature computation')
                .set_attribute('unit','ms')
                .set_author("Md Azim Ullah", "*****@*****.**"))
        return stream_metadata

    def get_rr_features(a):
        return np.array([
            np.var(a),
            iqr(a),
            np.mean(a),
            np.median(a),
            np.percentile(a, 80),
            np.percentile(a, 20), 60000 / np.median(a)
        ])

    def frequencyDomain(RRints,
                        tmStamps,
                        band_type=None,
                        lf_bw=0.11,
                        hf_bw=0.1,
                        vlf=(0.003, 0.04),
                        lf=(0.04, 0.15),
                        hf=(0.15, 0.4)):
        """

        Args:
            RRints:
            tmStamps:
            band_type:
            lf_bw:
            hf_bw:
            vlf:
            lf:
            hf:

        Returns:

        """
        NNs = RRints
        tss = tmStamps
        frequency_range = np.linspace(0.001, 1, 10000)
        NNs = np.array(NNs)
        NNs = NNs - np.mean(NNs)
        result = signal.lombscargle(tss, NNs, frequency_range)

        #Pwelch w/ zero pad
        fxx = frequency_range
        pxx = result

        if band_type == 'adapted':

            vlf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and(
                fxx >= vlf[0], fxx < vlf[1])]))[0][0]]
            lf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and(
                fxx >= lf[0], fxx < lf[1])]))[0][0]]
            hf_peak = fxx[np.where(pxx == np.max(pxx[np.logical_and(
                fxx >= hf[0], fxx < hf[1])]))[0][0]]

            peak_freqs = (vlf_peak, lf_peak, hf_peak)

            hf = (peak_freqs[2] - hf_bw / 2, peak_freqs[2] + hf_bw / 2)
            lf = (peak_freqs[1] - lf_bw / 2, peak_freqs[1] + lf_bw / 2)
            vlf = (0.003, lf[0])

            if lf[0] < 0:
                print(
                    '***Warning***: Adapted LF band lower bound spills into negative frequency range'
                )
                print('Lower threshold of LF band has been set to zero')
                print('Adjust LF and HF bandwidths accordingly')
                lf = (0, lf[1])
                vlf = (0, 0)
            elif hf[0] < 0:
                print(
                    '***Warning***: Adapted HF band lower bound spills into negative frequency range'
                )
                print('Lower threshold of HF band has been set to zero')
                print('Adjust LF and HF bandwidths accordingly')
                hf = (0, hf[1])
                lf = (0, 0)
                vlf = (0, 0)

        df = fxx[1] - fxx[0]
        vlf_power = np.trapz(pxx[np.logical_and(fxx >= vlf[0], fxx < vlf[1])],
                             dx=df)
        lf_power = np.trapz(pxx[np.logical_and(fxx >= lf[0], fxx < lf[1])],
                            dx=df)
        hf_power = np.trapz(pxx[np.logical_and(fxx >= hf[0], fxx < hf[1])],
                            dx=df)
        totalPower = vlf_power + lf_power + hf_power

        #Normalize and take log
        vlf_NU_log = np.log((vlf_power / (totalPower - vlf_power)) + 1)
        lf_NU_log = np.log((lf_power / (totalPower - vlf_power)) + 1)
        hf_NU_log = np.log((hf_power / (totalPower - vlf_power)) + 1)
        lfhfRation_log = np.log((lf_power / hf_power) + 1)

        freqDomainFeats = {
            'VLF_Power': vlf_NU_log,
            'LF_Power': lf_NU_log,
            'HF_Power': hf_NU_log,
            'LF/HF': lfhfRation_log
        }

        return freqDomainFeats

    schema = StructType([
        StructField("timestamp", TimestampType()),
        StructField("start", TimestampType()),
        StructField("end", TimestampType()),
        StructField("localtime", TimestampType()),
        StructField("version", IntegerType()),
        StructField("user", StringType()),
        StructField("features", ArrayType(DoubleType()))
    ])

    @pandas_udf(schema, PandasUDFType.GROUPED_MAP)
    @CC_MProvAgg('org.md2k.autosense.ecg.rr', 'get_hrv_features', stream_name,
                 ['user', 'timestamp'], ['user', 'timestamp'])
    def ecg_r_peak(key, data):
        """

        Args:
            key:
            data:

        Returns:

        """
        if data.shape[0] >= acceptable_percentage * window_length / 100:
            data = data.sort_values('time')
            data['time'] = 1000 * data['time']
            a = data['rr'].values
            features = [
                np.double(
                    np.array(
                        list(get_rr_features(a)) + list(
                            frequencyDomain(
                                np.array(a) / 1000,
                                np.cumsum(a) / 1000).values())))
            ]
            data = data[:1]
            data['features'] = features
            data['start'] = [key[2]['start']]
            data['end'] = [key[2]['end']]
            data = data[[
                'timestamp', 'localtime', 'version', 'user', 'start', 'end',
                'features'
            ]]
            return data
        else:
            return pd.DataFrame([],
                                columns=[
                                    'timestamp', 'localtime', 'version',
                                    'user', 'features', 'start', 'end'
                                ])

    rr_data = rr_data.withColumn('time', F.col('timestamp').cast('double'))
    ecg_features = rr_data.compute(ecg_r_peak,
                                   windowDuration=window_length,
                                   startTime='0 seconds')
    df = ecg_features.select('timestamp',
                             F.struct('start', 'end').alias('window'),
                             'localtime', 'features', 'user', 'version')
    df = df.withColumn('var', F.col('features').getItem(0))
    df = df.withColumn('iqr', F.col('features').getItem(1))
    df = df.withColumn('vlf', F.col('features').getItem(7))
    df = df.withColumn('lf', F.col('features').getItem(8))
    df = df.withColumn('hf', F.col('features').getItem(9))
    df = df.withColumn('lfhf', F.col('features').getItem(10))
    df = df.withColumn('mean', F.col('features').getItem(2))
    df = df.withColumn('median', F.col('features').getItem(3))
    df = df.withColumn('80th', F.col('features').getItem(4))
    df = df.withColumn('20th', F.col('features').getItem(5))
    ecg_features_final = df.withColumn('heartrate',
                                       F.col('features').getItem(6))
    ecg_features_final = ecg_features_final.drop('features')

    feature_names = [
        'var', 'iqr', 'mean', 'median', '80th', '20th', 'heartrate', 'vlf',
        'lf', 'hf', 'lfhf'
    ]
    stress_features = ecg_features_final.withColumn(
        'features', F.array([F.col(i) for i in feature_names]))
    stress_features.metadata = get_metadata()

    return stress_features
Example #30
    resultMap_FilterUlr =  resultMap.map(lambda (a,b,c,d,e,f,g,h,i,j,l): (a,b,c,d,e,f,g,h,i,j,regularExpression(l.split(",")))). \
                                    filter(lambda (a,b,c,d,e,f,g,h,i,j,l): len(l) >1)

    #put on Json
    fields = StructType( \
                        [StructField("GSN", StringType(), False),  \
                        StructField("ChargingID", IntegerType(), False),  \
                        StructField("RecordSequence", IntegerType(), False),  \
                        StructField("RecordOpeningDate", TimestampType(), False),  \
                        StructField("rATType", IntegerType(), False),  \
                        StructField("UserLocation", StringType(), False),  \
                        StructField("Accuracy", IntegerType(), False),  \
                        StructField("BrowsingSession", IntegerType(), False),  \
                        StructField("Uplink", IntegerType(), False),  \
                        StructField("Downlink", IntegerType(), False), \
                        StructField("Urls", ArrayType(StringType(),False))])

    #The new Json Format
    newStructure = StructType( \
                        [StructField("GSN", StringType(), False),  \
                        StructField("ChargingID", IntegerType(), False),  \
                        StructField("RecordSequence", IntegerType(), False),  \
                        StructField("RecordOpeningDate", TimestampType(), False),  \
                        StructField("rATType", IntegerType(), False),  \
                        StructField("UserLocation", StringType(), False),  \
                        StructField("Accuracy", IntegerType(), False),  \
                        StructField("BrowsingSession", IntegerType(), False),  \
                        StructField("Uplink", IntegerType(), False),  \
                        StructField("Downlink", IntegerType(), False), \
                        StructField("Urls",ArrayType( \
                                                     StructType([StructField("name", StringType(), False),StructField("domain", StringType(), True), \