#############
#############
#############
#############
#############
# filterData
# by JAG3
#
#############
#############
#############
#############
#############
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row
from pyspark.sql.types import BooleanType
from datetime import date
import sys
import argparse
sys.path.insert(0, './lib/')
from to_parquet import csvToDataFrame
import fspLib
import shapeReader

# HARD CODE YOUR INPUT DATA SETS AND DATA TYPES
DATA_SETS = {"hdfs://xdata/qcr/gnip": 66}
LOWER_TIME = date(2006, 3, 21)
UPPER_TIME = date(3000, 1, 1)

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("shapeFile", help="The shape file path")
    parser.add_argument("outputPath", help="Output destination")
    parser.add_argument("-jobNm",
                        help="Application name, default = 'Geqe data filter.'",
                        default='Geqe data filter.')
    parser.add_argument("-cNum", type=int, help=
dropoff_centroid_longitude
dropoff_centroid_location"""

# partition the non-string column types
bool_fields = set(["shared_trip_authorized"])
float_fields = set([
    "trip_seconds", "trip_miles", "fare", "tip", "additional_charges",
    "trip_total"
])
int_fields = set(["trips_pooled"])

# for each column name, assign it a specific type
fields = [
    StructField(field_name, BooleanType()) if field_name in bool_fields
    else StructField(field_name, FloatType()) if field_name in float_fields
    else StructField(field_name, IntegerType()) if field_name in int_fields
    else StructField(field_name, StringType())
    for field_name in schemaString.split("\n")
]

# store schema
schema = StructType(fields)

# start spark session ----
spark = (SparkSession.builder.master("local[1]")
         .appName("Python Spark SQL example")
         .getOrCreate())

# load necessary data ----
from pyspark.sql import SparkSession from pyspark.sql import functions as F from pyspark.sql.types import StructField, StructType, StringType, BooleanType, ArrayType, DateType, FloatType # create a StructType for the Kafka redis-server topic which has all changes made # to Redis - before Spark 3.0.0, schema inference is not automatic KAFKA_HOST = "kafka:19092" SPARK_HOST = "spark://spark:7077" redisRawMessageSchema = StructType([ StructField("key", StringType()), StructField("existType", StringType()), StructField("Ch", BooleanType()), StructField("Incr", BooleanType()), StructField( "zSetEntries", ArrayType( StructType([ StructField("element", StringType()), StructField("Score", StringType()) ]))) ]) # create a StructType for the Customer JSON that comes # from Redis- before Spark 3.0.0, schema inference is not automatic redisCustomerSchema = StructType([ StructField("customerName", StringType()), StructField("email", StringType()), StructField("phone", StringType()),
from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType


def make_not_terminal_udf():
    """ Return true iff next_action is a non-empty map (i.e. the transition is not terminal) """

    def get_not_terminal(next_action):
        return len(next_action) > 0

    return udf(get_not_terminal, BooleanType())
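# Usage sketch (an assumption, not part of the original source): applying the
# UDF returned by make_not_terminal_udf() to a toy map-typed column.
from pyspark.sql import SparkSession

_spark = SparkSession.builder.master("local[1]").appName("sketch").getOrCreate()
_sample = _spark.createDataFrame([({"action": 1.0},), ({},)],
                                 "next_action map<string,double>")
_not_terminal = make_not_terminal_udf()
_sample.withColumn("not_terminal", _not_terminal("next_action")).show()
# {action -> 1.0} -> true (non-empty map), {} -> false (terminal)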
secret = open(app_secrets_path)
app_secret = yaml.load(secret, Loader=yaml.FullLoader)

# Set up Spark to use S3
hadoop_conf = spark.sparkContext._jsc.hadoopConfiguration()
hadoop_conf.set("fs.s3a.access.key", app_secret["s3_conf"]["access_key"])
hadoop_conf.set("fs.s3a.secret.key", app_secret["s3_conf"]["secret_access_key"])

print("\nCreating dataframe by ingesting a CSV file using 'SparkSession.read.format()'")

fin_schema = StructType() \
    .add("id", IntegerType(), True) \
    .add("has_debt", BooleanType(), True) \
    .add("has_financial_dependents", BooleanType(), True) \
    .add("has_student_loans", BooleanType(), True) \
    .add("income", DoubleType(), True)

fin_df = spark.read \
    .option("header", "false") \
    .option("delimiter", ",") \
    .format("csv") \
    .schema(fin_schema) \
    .load("s3a://" + app_conf["s3_conf"]["s3_bucket"] + "/finances.csv")

fin_df.printSchema()
fin_df.show()

print(
# spark-submit --master yarn --num-executors 10 --executor-cores 5 --executor-memory 16G --driver-memory 32G --conf spark.driver.maxResultSize=10G trainready_data_cmp.py sc = SparkContext.getOrCreate() sc.setLogLevel('WARN') hive_context = HiveContext(sc) df = hive_context.read.format('com.databricks.spark.csv').options( header='true').load('train_ready_bad_uckey_removal_10percent_denoise.csv') c = df.columns del c[0] df = df.withColumn( 'ts', udf(lambda x: [x[_] for _ in c], ArrayType(StringType()))(fn.struct(c))) df = df.withColumn('sparse', udf(lambda x: 'vir' in x, BooleanType())(col('_c0'))) df = df.filter('sparse==False') df = df.withColumn( 'imp', udf(lambda x: int(sum([float(_) for _ in x if _])), IntegerType())(df.ts)) df = df.select('_c0', 'ts', 'sparse', 'imp') # df.show(1, False) jimmy = df.collect() jimmy_dict = {} for _ in jimmy: jimmy_dict[_['_c0']] = _ df = hive_context.sql('select * from dlpm_06242021_1635_trainready') df = df.withColumn('sparse', udf(lambda x: ',' not in x, BooleanType())(col('uckey'))) df = df.filter('sparse==False')
    return df


# BOOLEAN: whether the match was played at home
def resultat_a_domicile_oui_non(dframe):
    if dframe[0:6] == 'France':
        return True
    else:
        return False


# Add the a_Domicile column holding the boolean answer
def a_domicile(dframe):
    df = dframe.withColumn('a_Domicile', resultat_a_domicile_oui_non(dframe.match))
    return df


resultat_a_domicile_oui_non = F.udf(resultat_a_domicile_oui_non, BooleanType())


# Whether the match was played in the World Cup
def en_coupe_du_monde(competition_colonne):
    if competition_colonne[:5] == 'Coupe':
        return 1
    else:
        return 0


jouer_en_coupe_du_monde = F.udf(en_coupe_du_monde, IntegerType())


def statistiques(dframe):
    df = (dframe
          .groupBy("adversaire")
          .agg(
def getCrosswalkDF(spark=None, columns=None, strong_mcd_states=STRONG_MCD_STATES, aian_areas=AIAN_AREAS, aian_ranges_path=AIAN_RANGES_PATH, fed_airs=FED_AIRS): """ Loads the 2010 crosswalk files that Simson generated from the 2010 GRFC into a Spark DF Parameters ========== spark : SparkSession columns : str or list of str (default is None, which will return all columns in the file) - This determines which columns survive from the original crosswalk data file, as the function will only return a Spark DF with the columns listed here Returns ======= a Spark DF containing crosswalk columns Notes ===== - This function also generates a number of additional columns to expand the ease-of-use when aggregating blocks to form geographic units in different geographic levels. - e.g. Rather than COUNTY being the 3-digit FIPS code, the COUNTY column will concatenate both the 2-digit STATE FIPS code and the 3-digit COUNTY FIPS code to create a 5-digit COUNTY code that is unique from all other 5-digit COUNTY codes. """ crosswalk = f"{DAS_S3ROOT}/2010/geounit_crosswalks/24vars/" crossdf = spark.read.option("header", "true").csv(crosswalk) # add "geocode" column based on GEOID (which is the 16 digit block id) crossdf = crossdf.withColumn("geocode", crossdf['GEOID']) # generate unique counties crossdf = crossdf.withColumn("COUNTY", sf.concat(sf.col("STATE"), sf.col("COUNTY"))) # generate unique tract groups crossdf = crossdf.withColumn("TRACT_GROUP", sf.concat(sf.col("County"), crossdf.TRACT[0:4])) # generate unique tracts crossdf = crossdf.withColumn("TRACT", sf.concat(sf.col("COUNTY"), sf.col("TRACT"))) # generate block group column crossdf = crossdf.withColumn("BLOCK_GROUP", crossdf.BLOCK[0:1]) # generate unique block groups crossdf = crossdf.withColumn("BLOCK_GROUP", sf.concat(sf.col("TRACT"), sf.col("BLOCK_GROUP"))) # generate unique blocks crossdf = crossdf.withColumn("BLOCK", sf.concat(sf.col("BLOCK_GROUP"), sf.col("BLOCK"))) # generate unique SLDLs (only unique if state fips has been prepended to the SLDL identifier) crossdf = crossdf.withColumn("SLDL", sf.concat(sf.col("STATE"), sf.col("SLDL"))) # generate unique SLDUs (only unique if state fips has been prepended to the SLDU identifier) crossdf = crossdf.withColumn("SLDU", sf.concat(sf.col("STATE"), sf.col("SLDU"))) # generate unique Congressional Districts (111th Congress) - only unique if state fips has been prepended to the CD identifier crossdf = crossdf.withColumn("CD", sf.concat(sf.col("STATE"), sf.col("CD"))) # generate unique school districts (only unique if state fips has been prepended to the identifiers) crossdf = crossdf.withColumn("SDELM", sf.concat(sf.col("STATE"), sf.col("SDELM"))) crossdf = crossdf.withColumn("SDSEC", sf.concat(sf.col("STATE"), sf.col("SDSEC"))) crossdf = crossdf.withColumn("SDUNI", sf.concat(sf.col("STATE"), sf.col("SDUNI"))) # generate unique urban areas and urban growth areas (only unique if state prepended) crossdf = crossdf.withColumn("UA", sf.concat(sf.col("STATE"), sf.col("UA"))) crossdf = crossdf.withColumn("UGA", sf.concat(sf.col("STATE"), sf.col("UGA"))) # generate unique puma and place ids (only unique if state prepended) crossdf = crossdf.withColumn("PUMA", sf.concat(sf.col("STATE"), sf.col("PUMA"))) crossdf = crossdf.withColumn("PLACE", sf.concat(sf.col("STATE"), sf.col("PLACE"))) # generate unique county subdivisions (only unique if state and county prepended) crossdf = crossdf.withColumn("COUSUB", sf.concat(sf.col("COUNTY"), sf.col("COUSUB"))) # generate unique subminor civil divisions (only unique if 
state, county, and county subdivisions prepended) crossdf = crossdf.withColumn("SUBMCD", sf.concat(sf.col("COUSUB"), sf.col("SUBMCD"))) # voting districts appear to have a floating space (" ") character in every VTD code, so we'll remove them as they # don't appear in the BlockAssign files for VTD ### Update - 2019-06-25 - The floating space is a valid character in the 6-character VTD codes; the first character # isn't always a " ", so " " is just another part of the code. #crossdf = crossdf.withColumn("VTD1st", crossdf.VTD[0:1]) # generate unique voting districts (only unique if state and county prepended) crossdf = crossdf.withColumn("VTD", sf.concat(sf.col("COUNTY"), sf.col("VTD"))) # create a column for the nation crossdf = crossdf.withColumn("US", sf.lit("Nation")) # Note: When using any of the columns from the next block, filter out IDs composed only of "9"'s aian_ranges_dict = make_aian_ranges_dict(aian_ranges_path, aian_areas) is_fed_air_udf = udf(lambda aiannhce: in_aian_class(aiannhce, fed_airs, aian_ranges_dict), BooleanType()) is_aian_udf = udf(lambda aiannhce: in_aian_class(aiannhce, aian_areas, aian_ranges_dict), BooleanType()) crossdf = add_aiannhce_col(spark, crossdf) # aian_areas: crossdf = crossdf.withColumn("AIAN_AREAS", sf.when(is_aian_udf("AIANNHCE"), sf.col("AIANNHCE")).otherwise(CC.NOT_AN_AIAN_AREA)) crossdf = crossdf.withColumn("FED_AIRS", sf.when(is_fed_air_udf("AIANNHCE"), sf.col("AIANNHCE")).otherwise(CC.NOT_AN_AIAN_AREA)) # portions of Blocks/Tracts/States within aian_areas: crossdf = crossdf.withColumn("AIANBlock", sf.when(sf.col("AIAN_AREAS") != CC.NOT_AN_AIAN_AREA, sf.col("BLOCK")).otherwise(CC.NOT_AN_AIAN_BLOCK)) crossdf = crossdf.withColumn("AIANTract", sf.col("AIANBlock")[0:11]) crossdf = crossdf.withColumn("AIANState", sf.col("AIANTract")[0:2]) # Define an off-spine entity (OSE) as Place in AIAN areas/ non-strong-MCD states and MCD otherwise: crossdf = crossdf.withColumn("OSE", sf.when((sf.col("AIAN_AREAS") == CC.NOT_AN_AIAN_AREA) & (sf.col("STATE").isin(strong_mcd_states)), sf.col("COUSUB")).otherwise(sf.col("PLACE"))) crossdf = crossdf.withColumn("COUNTY_NSMCD", sf.when(sf.col("STATE").isin(strong_mcd_states), CC.STRONG_MCD_COUNTY).otherwise(sf.col("COUNTY"))) crossdf = crossdf.withColumn("MCD", sf.when(sf.col("STATE").isin(strong_mcd_states), sf.col("COUSUB")).otherwise(sf.lit(CC.NOT_A_MCD))) if columns is None: columns = crossdf.columns else: # always want 'geocode' (aka Block ID, GEOID) in the crosswalk dataframe columns = np.unique(du.aslist(columns) + ['geocode']).tolist() crossdf = crossdf.select(columns) return crossdf
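# Usage sketch (an assumption, not part of the original source; assumes an
# active SparkSession `spark`): requesting only county-level columns.
# 'geocode' is always re-added so blocks can be joined back to the crosswalk.
county_df = getCrosswalkDF(spark, columns=["STATE", "COUNTY"])
# county_df.columns -> ['COUNTY', 'STATE', 'geocode'] (np.unique sorts the names)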
def main(sc):
    spark = SparkSession(sc)
    # b = b.select('L_LOW_HN', 'L_HIGH_HN', 'FULL_STREE', 'ST_LABEL', 'BOROCODE', 'PHYSICALID')
    b = spark.read.load('centerline (1).csv', format='csv', header=True, inferSchema=True)
    b.head(10)
    b = b.withColumn('New L_LOW_HN', funct.concat_ws('.', funct.split(b['L_LOW_HN'], '-')))
    b = b.withColumn('New R_LOW_HN', funct.concat_ws('.', funct.split(b['R_LOW_HN'], '-')))
    b = b.withColumn('New L_HIGH_HN', funct.concat_ws('.', funct.split(b['L_HIGH_HN'], '-')))
    b = b.withColumn('New R_HIGH_HN', funct.concat_ws('.', funct.split(b['R_HIGH_HN'], '-')))
    # b.select('BOROCODE').distinct().show()
    b = b.select('NEW L_LOW_HN', 'NEW L_HIGH_HN', 'NEW R_LOW_HN', 'NEW R_HIGH_HN',
                 'FULL_STREE', 'ST_LABEL', 'BOROCODE', 'PHYSICALID')
    b.head(100)

    a = spark.read.load('small_data1.csv', format='csv', header=True, inferSchema=True)
    # a.head()
    a = a.withColumn('Year', funct.split(a['Issue Date'], '/').getItem(2))
    # Keep violations issued between 2015 and 2019; the original `|` made the
    # condition a tautology that kept every row.
    a = a.filter((a['Year'] >= '2015') & (a['Year'] <= '2019'))
    a = a.filter(a['House Number'].rlike(r'^[0-9]*\-*[0-9]*$'))
    a = a.withColumn('New House Number', funct.concat_ws('.', funct.split(a['House Number'], '-')))
    a = a.select('Violation County', 'New House Number', 'Street Name', 'Year')
    a.head(10)

    def process(c, d, e, f, g, h, i, j, k, l, m, n):
        a = [c, d, e, f]
        b = [g, h, i, j, k, l, m, n]
        # if a[0] == 'K':
        #     return True
        boro = {
            3: ['K', 'KINGS', 'KING', 'BK'],
            2: ['BX', 'BRONX'],
            1: ['NY', 'MAN', 'MH', 'NEW Y', 'NEWY', 'MN'],
            5: ['R', 'RICHMOND'],
            4: ['Q', 'QU', 'QUEEN', 'QN', 'QNS']
        }
        if c in boro[m]:
            # return True
            if ((b[4] == a[2]) | (b[5] == a[2])):
                return True
            # if float(a[1]) % 2 == 0:
            #     if ((float(b[3]) <= float(a[1])) & (float(b[2]) >= float(a[1]))):
            #         return True
            # else:
            #     if ((float(b[1]) <= float(a[1])) & (float(b[0]) >= float(a[1]))):
            #         return True
        return False

    acol = a.columns
    bcol = b.columns
    p = funct.udf(process, BooleanType())
    v = a.crossJoin(b).where(
        p(a['Violation County'], a['New House Number'], a['Street Name'], a['Year'],
          b['NEW L_LOW_HN'], b['NEW L_HIGH_HN'], b['NEW R_LOW_HN'], b['NEW R_HIGH_HN'],
          b['FULL_STREE'], b['ST_LABEL'], b['BOROCODE'], b['PHYSICALID']))
    v = v.groupBy("Year", "PHYSICALID").count()
    v.show(10)
'tolls_amt': 'tolls_amount', 'total_amt': 'total_amount', } schema = StructType([ StructField('vendor_name', StringType(), False), StructField('pickup_datetime', TimestampType(), False), StructField('dropoff_datetime', TimestampType(), False), StructField('passenger_count', IntegerType(), False), StructField('trip_distance', FloatType(), False), StructField('pickup_latitude', DoubleType(), False), StructField('pickup_longitude', DoubleType(), False), StructField('ratecode_id', IntegerType(), False), StructField('pickup_location_id', IntegerType(), False), StructField('dropoff_location_id', IntegerType(), False), StructField('store_and_forward_flag', BooleanType(), False), StructField('dropoff_latitude', DoubleType(), False), StructField('dropoff_longitude', DoubleType(), False), StructField('payment_type', IntegerType(), False), StructField('fare_amount', FloatType(), False), StructField('surcharge', FloatType(), False), StructField('improvement_surcharge', FloatType(), False), StructField('congestion_surcharge', FloatType(), False), StructField('mta_tax', FloatType(), False), StructField('tip_amount', FloatType(), False), StructField('tolls_amount', FloatType(), False), StructField('total_amount', FloatType(), False), ]) def payment_type_f(v):
# The script takes one argument, so argv needs at least two entries
# (argv[0] is the script name); the original `< 1` check could never fire.
if len(sys.argv) < 2:
    print('Usage: ' + sys.argv[0] + ' <database>')
    sys.exit(1)

# Grab the parameters
database = sys.argv[1]

# Create a spark context for the job. The context is used to manage the job at a high level.
appName = "ETL-%s" % database
spark = SparkSession \
    .builder \
    .appName(appName) \
    .getOrCreate()

# Register UDFs
udfIsDurationCorrect = udf(isDurationCorrect, BooleanType())
udfRemoveBraces = udf(removeBraces, StringType())

# Read in the dataset
logs = spark.read.csv("/incoming/logs/upload", sep="\t", inferSchema=True, header="True")

# Process the dataset
streams_raw = logs.filter(logs['eventType'] == 'SongPlayed')
streams_projected = streams_raw.drop('eventType')
streams = streams_projected.withColumnRenamed('itemId', 'trackId')
streams_correct = streams.filter(udfIsDurationCorrect('duration'))
streams_cleaned = streams_correct.select(
    udfRemoveBraces('ts').alias('ts'),
    'host',
    'userId',
    'trackId',
def test_as_spark_type_pandas_on_spark_dtype(self): type_mapper = { # binary np.character: (np.character, BinaryType()), np.bytes_: (np.bytes_, BinaryType()), np.string_: (np.bytes_, BinaryType()), bytes: (np.bytes_, BinaryType()), # integer np.int8: (np.int8, ByteType()), np.byte: (np.int8, ByteType()), np.int16: (np.int16, ShortType()), np.int32: (np.int32, IntegerType()), np.int64: (np.int64, LongType()), np.int: (np.int64, LongType()), int: (np.int64, LongType()), # floating np.float32: (np.float32, FloatType()), np.float: (np.float64, DoubleType()), np.float64: (np.float64, DoubleType()), float: (np.float64, DoubleType()), # string np.str: (np.unicode_, StringType()), np.unicode_: (np.unicode_, StringType()), str: (np.unicode_, StringType()), # bool np.bool: (np.bool, BooleanType()), bool: (np.bool, BooleanType()), # datetime np.datetime64: (np.datetime64, TimestampType()), datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()), # DateType datetime.date: (np.dtype("object"), DateType()), # DecimalType decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)), # ArrayType np.ndarray: (np.dtype("object"), ArrayType(StringType())), List[bytes]: (np.dtype("object"), ArrayType(BinaryType())), List[np.character]: (np.dtype("object"), ArrayType(BinaryType())), List[np.bytes_]: (np.dtype("object"), ArrayType(BinaryType())), List[np.string_]: (np.dtype("object"), ArrayType(BinaryType())), List[bool]: (np.dtype("object"), ArrayType(BooleanType())), List[np.bool]: (np.dtype("object"), ArrayType(BooleanType())), List[datetime.date]: (np.dtype("object"), ArrayType(DateType())), List[np.int8]: (np.dtype("object"), ArrayType(ByteType())), List[np.byte]: (np.dtype("object"), ArrayType(ByteType())), List[decimal.Decimal]: (np.dtype("object"), ArrayType(DecimalType(38, 18))), List[float]: (np.dtype("object"), ArrayType(DoubleType())), List[np.float]: (np.dtype("object"), ArrayType(DoubleType())), List[np.float64]: (np.dtype("object"), ArrayType(DoubleType())), List[np.float32]: (np.dtype("object"), ArrayType(FloatType())), List[np.int32]: (np.dtype("object"), ArrayType(IntegerType())), List[int]: (np.dtype("object"), ArrayType(LongType())), List[np.int]: (np.dtype("object"), ArrayType(LongType())), List[np.int64]: (np.dtype("object"), ArrayType(LongType())), List[np.int16]: (np.dtype("object"), ArrayType(ShortType())), List[str]: (np.dtype("object"), ArrayType(StringType())), List[np.unicode_]: (np.dtype("object"), ArrayType(StringType())), List[datetime.datetime]: (np.dtype("object"), ArrayType(TimestampType())), List[np.datetime64]: (np.dtype("object"), ArrayType(TimestampType())), # CategoricalDtype CategoricalDtype(categories=["a", "b", "c"]): ( CategoricalDtype(categories=["a", "b", "c"]), LongType(), ), } for numpy_or_python_type, (dtype, spark_type) in type_mapper.items(): self.assertEqual(as_spark_type(numpy_or_python_type), spark_type) self.assertEqual(pandas_on_spark_type(numpy_or_python_type), (dtype, spark_type)) with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."): as_spark_type(np.dtype("uint64")) with self.assertRaisesRegex(TypeError, "Type object was not understood."): as_spark_type(np.dtype("object")) with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."): pandas_on_spark_type(np.dtype("uint64")) with self.assertRaisesRegex(TypeError, "Type object was not understood."): pandas_on_spark_type(np.dtype("object"))
StructField('ResponseDtTm', StringType(), True), StructField('OnSceneDtTm', StringType(), True), StructField('TransportDtTm', StringType(), True), StructField('HospitalDtTm', StringType(), True), StructField('CallFinalDisposition', StringType(), True), StructField('AvailableDtTm', StringType(), True), StructField('Address', StringType(), True), StructField('City', StringType(), True), StructField('ZipcodeofIncident', IntegerType(), True), StructField('Battalion', StringType(), True), StructField('StationArea', StringType(), True), StructField('Box', StringType(), True), StructField('OriginalPriority', StringType(), True), StructField('Priority', StringType(), True), StructField('FinalPriority', IntegerType(), True), StructField('ALSUnit', BooleanType(), True), StructField('CallTypeGroup', StringType(), True), StructField('NumberofAlarms', IntegerType(), True), StructField('UnitType', StringType(), True), StructField('Unitsequenceincalldispatch', IntegerType(), True), StructField('FirePreventionDistrict', StringType(), True), StructField('SupervisorDistrict', StringType(), True), StructField('NeighborhoodDistrict', StringType(), True), StructField('Location', StringType(), True), StructField('RowID', StringType(), True) ]) # read the file using DataFrameReader using format CSV fire_df = spark.read.csv(sf_fire_file, header=True, schema=fire_schema) fire_df.show(10)
.load("s3a://living-insight-data/DOB_NOW__Build___Approved_Permits.csv") mental_health = spark.read.format("csv") \ .option("header","true") \ .option("inferSchema","true") \ .load("s3a://living-insight-data/Mental_Health_Service_Finder_Data.csv") buildings_rdd = buildings.limit(1000).rdd.map(processhouse) buildings = buildings_rdd.toDF() buildings.write.jdbc("jdbc:postgresql://localhost:5432/living_insight", table = 'buildings', properties = { "user": "******", "password" : "postgres" }) mental_health_rdd = mental_health.rdd.zipWithIndex().map(processmentalhealth) mental_health = mental_health_rdd.toDF() mental_health = mental_health.filter(mental_health.longitude.isNotNull()) mental_udf = udf(handle_building,BooleanType()) house_id_with_mental_health = buildings.crossJoin(mental_health).where(mental_udf(struct([buildings[x] for x in buildings.columns]), struct([mental_health[x] for x in mental_health.columns]))).select(buildings.house_id,mental_health.query_id) mental_health.write.jdbc("jdbc:postgresql://localhost:5432/living_insight", table = 'mental_health', properties = { "user": "******", "password" : "postgres" }) house_id_with_mental_health.write.jdbc("jdbc:postgresql://localhost:5432/living_insight", table = 'house_id_mental_health', properties = { "user": "******", "password" : "postgres" }) print("--- %s seconds ---" % (time.time() - start_time)) spark.stop()
def test_train_val_split_col_boolean(self): with spark_session('test_train_val_split_col_boolean') as spark: data = [ [1.0, False], [1.0, False], [1.0, False], [1.0, False], [1.0, True] ] schema = StructType([StructField('data', FloatType()), StructField('val', BooleanType())]) df = create_test_data_from_schema(spark, data, schema) validation = 'val' train_df, val_df, validation_ratio = util._train_val_split(df, validation) # Only check counts as validation ratio cannot be guaranteed due to approx calculation assert train_df.count() == 4 assert val_df.count() == 1
StructField('description', StringType(), True), StructField('amount', StringType(), True), StructField('day_number', StringType(), True), StructField('weekday', StringType(), True), ] WRONG_DEFAULT_STRUCT = StructType(fields=wrong_default_types) correct_fortnightly_regularity_types = [ StructField('date', DateType(), True), StructField('description', StringType(), True), StructField('amount', DoubleType(), True), StructField('day_number', StringType(), True), StructField('weekday', StringType(), True), StructField('days_passed', IntegerType(), True), StructField('fortnightly', BooleanType(), True) ] FINAL_FORTNIGHTLY_STRUCT = StructType( fields=correct_fortnightly_regularity_types) wrong_fortnightly_regularity_types = [ StructField('date', StringType(), True), StructField('description', StringType(), True), StructField('amount', DoubleType(), True), StructField('day_number', StringType(), True), StructField('weekday', StringType(), True), StructField('days_passed', StringType(), True), StructField('fortnightly', StringType(), True) ]
def process_bus_data(bus_df):
    """ Method to process raw business data from Yelp."""

    def select_elibigble_bus(row):
        """ Select businesses which fall into selected categories."""
        global categories
        try:
            # Return true if business falls into category list, else false.
            row_cats = row.split(',')
            for cat in row_cats:
                if cat.strip() in categories:
                    return True
            return False
        except (TypeError, AttributeError):
            # Returns false if business has no defined categories.
            return False

    def unpack_bus_attributes(row):
        """ Unpacks Business attributes and assigns them an index value."""
        # List to store business attributes.
        unpacked = list()
        # Unpack all attributes except PriceRange, Parking and WiFi
        temp = [row[s] for s in bus_attributes]
        # Process PriceRange
        try:
            priceRange = int(row["RestaurantsPriceRange2"])
        except (TypeError, ValueError):
            # If no price range specified - default=2
            priceRange = 2
        # Process Parking
        try:
            parking = 1 if (row["BusinessParking"].find("True")) != -1 else -1
        except AttributeError:
            parking = 0
        # Process WiFi
        if row["WiFi"] == 'no' or row["WiFi"] == "u'no'":
            wifi = -1
        elif row["WiFi"] is None:
            wifi = 0
        else:
            wifi = 1
        # Tokenize all Boolean attributes.
        for i in temp:
            if i == "True":
                unpacked.append(1)
            elif i == "False":
                unpacked.append(-1)
            else:
                unpacked.append(0)
        # Append the WiFi, Parking and PriceRange attributes
        unpacked.append(wifi)
        unpacked.append(parking)
        unpacked.append(priceRange)
        # Print any arrays that are not of the desired length (=30).
        if len(unpacked) != 30:
            print(unpacked)
        return _convert_to_vector(
            csc_matrix(np.asarray(unpacked).astype(float)).T)

    def unpack_bus_categories(row):
        """Unpacks all business categories."""
        # List to store business categories.
        unpacked = list()
        for cat in row.split(','):
            unpacked.append(cat.strip())
        return unpacked

    def unpack_price_range(row):
        """ Returns price range."""
        return int(row[-1])

    # Package the functions above into Spark SQL user-defined functions
    udf_select_eligible_bus = udf(select_elibigble_bus, BooleanType())
    udf_unpack_bus_attributes = udf(unpack_bus_attributes, VectorUDT())
    udf_unpack_bus_categories = udf(unpack_bus_categories, ArrayType(StringType()))
    udf_unpack_price_range = udf(unpack_price_range, IntegerType())

    # Find businesses to include.
    eligible_bus = bus_df.withColumn("include", udf_select_eligible_bus(col("categories"))) \
        .filter(col("include") == True)

    # Process business attributes feature.
    all_bus_attributes = set(
        bus_df.select("attributes").take(1)[0].attributes.asDict().keys())
    bus_attributes_to_exclude = {
        'AcceptsInsurance', 'AgesAllowed', 'ByAppointmentOnly', 'Caters',
        'Corkage', 'DietaryRestrictions', 'HairSpecializesIn', 'Open24Hours',
        'RestaurantsAttire', 'RestaurantsPriceRange2', 'BusinessParking', 'WiFi'
    }
    bus_attributes = list(all_bus_attributes - bus_attributes_to_exclude)
    bus_attributes.sort()
    eligible_attr = eligible_bus.withColumn(
        "unpackedAttr", udf_unpack_bus_attributes(col("attributes")))

    # Process business categories feature.
    eligible_cats = eligible_attr.withColumn(
        "unpackedCats", udf_unpack_bus_categories(col("categories")))
    cv = CountVectorizer(inputCol="unpackedCats", outputCol="vectorizedCats")
    vectorized_cats = cv.fit(eligible_cats).transform(eligible_cats)

    # Un-bundle price range from all other attributes.
    unpacked_pr = vectorized_cats.withColumn(
        "priceRange", udf_unpack_price_range(col("unpackedAttr")))
    unpacked_pr.take(1)

    # Reduce dimensions of attributes and categories features, respectively.
pca_attr = PCA(k=3, inputCol="unpackedAttr", outputCol="pcaAttr").fit(unpacked_pr) temp = pca_attr.transform(unpacked_pr) temp.show() pca_cats = PCA(k=1, inputCol="vectorizedCats", outputCol="pcaCats").fit(temp) temp2 = pca_cats.transform(temp) temp2.show() # Assemble into final feature vector. va = VectorAssembler( inputCols=["stars", "priceRange", "pcaAttr", "pcaCats"], outputCol="featureVec") features = va.transform(temp2).select("business_id", "stars", "categories", "featureVec") features.take(1) # Unpack n_features = len(features.select("featureVec").take(1)[0].featureVec) final = features.withColumn("f", vector_to_array(col("featureVec"))) \ .select(["business_id", "stars", "categories"] + [col("f")[i] for i in range(n_features)]) return final, n_features
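# Usage sketch (an assumption, not part of the original source): bus_df is the
# raw Yelp business DataFrame, and the module-level `categories` list read by
# select_elibigble_bus must be populated beforehand.
# bus_df = spark.read.json("yelp_academic_dataset_business.json")
final_df, n_features = process_bus_data(bus_df)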
from json import loads

from pyspark.sql import SparkSession
from pyspark.sql.dataframe import DataFrame
# StructType/StructField live in pyspark.sql.types, not pyspark.sql.dataframe
from pyspark.sql.types import (ArrayType, IntegerType, LongType, BooleanType,
                               StructType, StructField)
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.sql.functions import udf, col

sc = SparkSession.builder.appName("dotingestion").getOrCreate()

schema = StructType([StructField("dire_lineup", ArrayType(IntegerType(), False), False),
                     StructField("radiant_lineup", ArrayType(IntegerType(), False), False),
                     StructField("radiant_win", BooleanType(), False),
                     StructField("match_id", LongType(), False)])

path = "data.json"
df = sc.read.json(path, schema=schema).na.drop("all").distinct()

with open("heroes.json", 'r', encoding="utf-8") as f:
    heroes_dict = {hero['id']: i for i, hero in enumerate(loads(f.read()))}


def convert_heroes_to_lineup(df: DataFrame) -> DataFrame:

    def onehot(heroes):
        # Map hero ids to contiguous slots, then build a dense 0/1 lineup vector.
        lineup = tuple(heroes_dict[hero] for hero in heroes)
        return Vectors.dense([1 if hero_slot in lineup else 0
                              for hero_slot in range(len(heroes_dict))])

    heros_to_lineup_udf = udf(onehot, VectorUDT())
    return df.withColumn("dire_lineup_vec", heros_to_lineup_udf(df.dire_lineup))\
             .withColumn("radiant_lineup_vec", heros_to_lineup_udf(df.radiant_lineup))


df = convert_heroes_to_lineup(df)
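# Sanity-check sketch (an assumption, not part of the original source): each
# lineup vector has length len(heroes_dict), with a 1 in every slot whose hero
# appears in that lineup.
df.select("radiant_lineup", "radiant_lineup_vec").show(1, truncate=False)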
def repeat(times, func, *args, **kwargs): for _ in range(times): yield func(*args, **kwargs) # --------- Create dataframe (from fake data) ------------- data = list(repeat(10000, fake_entry)) dataDF = spark.createDataFrame( data, ('last_name', 'first_name', 'ssn', 'occupation', 'age')) subDF = dataDF.select('last_name', 'first_name', 'ssn', 'occupation', (dataDF.age - 1).alias('age')) filteredDF = subDF.filter(subDF.age < 10) from pyspark.sql.types import BooleanType less_ten = udf(lambda s: s < 10, BooleanType()) lambdaDF = subDF.filter(less_ten(subDF.age)) lambdaDF.show() lambdaDF.count() # Let's collect the even values less than 10 even = udf(lambda s: s % 2 == 0, BooleanType()) evenDF = lambdaDF.filter(even(lambdaDF.age)) evenDF.show() evenDF.count() print("first: {0}\n".format(filteredDF.first())) print("Four of them: {0}\n".format(filteredDF.take(4))) tempDF = spark.createDataFrame([("Joe", 1), ("Joe", 1), ("Anna", 15), ("Anna", 12), ("Ravi", 5)], ('name', 'score'))
# Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. from pyspark.sql.functions import pandas_udf, PandasUDFType, col from pyspark.sql.types import BooleanType import pandas as pd from typing import Any from fink_broker.tester import spark_unit_tests @pandas_udf(BooleanType(), PandasUDFType.SCALAR) def keep_alert_based_on(nbad: Any, rb: Any, magdiff: Any) -> pd.Series: """ Experimental filtering service. For testing purposes only. Create a column whose entry is false if the alert has to be discarded, and true otherwise. Parameters ---------- nbad: Spark DataFrame Column Column containing the nbad values rb: Spark DataFrame Column Column containing the rb values magdiff: Spark DataFrame Column Column containing the magdiff values
from typing import List

from pyspark.sql.functions import udf
from pyspark.sql.types import BooleanType


def make_not_terminal_udf(actions: List[str]):
    """ Return true iff next_action is not terminal (terminal is encoded as idx = len(actions)) """

    def get_not_terminal(next_action):
        return next_action < len(actions)

    return udf(get_not_terminal, BooleanType())
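# Usage sketch (an assumption, not part of the original source): with two
# actions, index 2 is the terminal sentinel and is flagged false.
from pyspark.sql import SparkSession

_spark = SparkSession.builder.master("local[1]").appName("sketch").getOrCreate()
_sample = _spark.createDataFrame([(0,), (1,), (2,)], ["next_action"])
_not_terminal = make_not_terminal_udf(["left", "right"])
_sample.withColumn("not_terminal", _not_terminal("next_action")).show()
# 0 -> true, 1 -> true, 2 -> false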
def main(): get_unstructured_file() logger.success('Downloaded Files') schema = StructType([ StructField('Last Accessed Url', StringType(), True), StructField('Page Category', StringType(), True), StructField('Page Category 1', StringType(), True), StructField('Page Category 2', StringType(), True), StructField('Page Category 3', StringType(), True), StructField('Page Name', StringType(), True), StructField('at', StringType(), True), StructField('browser', StringType(), True), StructField('carrier', StringType(), True), StructField('city_name', StringType(), True), StructField('clv_total', LongType(), True), StructField('country', StringType(), True), StructField('custom_1', StringType(), True), StructField('custom_2', StringType(), True), StructField('custom_3', StringType(), True), StructField('custom_4', StringType(), True), StructField('device_new', BooleanType(), True), StructField('first-accessed-page', StringType(), True), StructField('install_uuid', StringType(), True), StructField('language', StringType(), True), StructField('library_ver', StringType(), True), StructField('marketing_campaign', StringType(), True), StructField('marketing_medium', StringType(), True), StructField('marketing_source', StringType(), True), StructField('model', StringType(), True), StructField('name', StringType(), True), StructField('nth', LongType(), True), StructField('os_ver', StringType(), True), StructField('platform', StringType(), True), StructField('region', StringType(), True), StructField('session_uuid', StringType(), True), StructField('studentId_clientType', StringType(), True), StructField('type', StringType(), True), StructField('user_type', StringType(), True), StructField('uuid', StringType(), True) ]) logger.debug('Creating DataFrame...') df = spark.read.schema(schema).json('*.json') logger.success('DataFrame created with {} rows'.format(df.count())) df = df.select('at', 'browser', 'country', 'custom_4', 'studentId_clientType', 'Page Name', 'Last Accessed Url') \ .filter(df['studentId_clientType'].isNotNull()) df_country = df.select(df.country) \ .filter(df.country != 'br') \ .groupBy('country').count() #df_country.repartition(1).write.format('csv').mode('overwrite').option('header', 'true').save('country') df_users = df.filter(df.custom_4.isNotNull()) \ .select(df.custom_4) \ .groupBy(df.custom_4).count() #df_users.repartition(1).write.format('csv').mode('overwrite').option('header', 'true').save('user.csv') df_result = df.withColumn('id', clean_studentId(df['studentId_clientType'])) df_result = df_result.drop('studentId_clientType') query = """SELECT fat.id, state, city, cou.name course FROM "DM_PASSEI_DIRETO".fat_students fat INNER JOIN "DM_PASSEI_DIRETO".dim_courses cou ON fat.course_id = cou.id INNER JOIN "DM_PASSEI_DIRETO".dim_sessions ds ON fat.id = ds.student_id WHERE CAST(ds.start_time as VARCHAR) LIKE '2017-11-16%'""" students = dw_get_data(query) students_schema = StructType([ StructField('id', StringType(), True), StructField('state', StringType(), True), StructField('city', StringType(), True), StructField('course', StringType(), True) ]) df_dim = spark.createDataFrame(students, students_schema) df_result = df_result.join(df_dim, 'id', how='inner').distinct() #df_result.repartition(1).write.format('csv').mode('overwrite').option('header', 'true').save('full.csv') df_country.toPandas().to_csv('country.csv') df_users.toPandas().to_csv('users.csv') df_result.toPandas().to_csv('full.csv') send_files('*.csv')
def select_relevant_columns(df,
                            discrete_action: bool = True,
                            include_possible_actions: bool = True):
    """ Select all the relevant columns and perform type conversions. """
    if not discrete_action and include_possible_actions:
        raise NotImplementedError(
            "currently we don't support include_possible_actions")

    # pyre-fixme[16]: Module `functions` has no attribute `col`.
    select_col_list = [
        col("reward").cast(FloatType()),
        col("state_features").cast(ArrayType(FloatType())),
        col("state_features_presence").cast(ArrayType(BooleanType())),
        col("next_state_features").cast(ArrayType(FloatType())),
        col("next_state_features_presence").cast(ArrayType(BooleanType())),
        col("not_terminal").cast(BooleanType()),
        col("action_probability").cast(FloatType()),
        col("mdp_id").cast(LongType()),
        col("sequence_number").cast(LongType()),
        col("step").cast(LongType()),
        col("time_diff").cast(LongType()),
        col("metrics").cast(ArrayType(FloatType())),
        col("metrics_presence").cast(ArrayType(BooleanType())),
    ]

    if discrete_action:
        select_col_list += [
            col("action").cast(LongType()),
            col("next_action").cast(LongType()),
        ]
    else:
        select_col_list += [
            col("action").cast(ArrayType(FloatType())),
            col("next_action").cast(ArrayType(FloatType())),
            col("action_presence").cast(ArrayType(BooleanType())),
col("next_action_presence").cast(ArrayType(BooleanType())), ] if include_possible_actions: select_col_list += [ # pyre-fixme[16]: Module `functions` has no attribute `col`. # pyre-fixme[16]: Module `functions` has no attribute `col`. col("possible_actions_mask").cast(ArrayType(LongType())), # pyre-fixme[16]: Module `functions` has no attribute `col`. # pyre-fixme[16]: Module `functions` has no attribute `col`. col("possible_next_actions_mask").cast(ArrayType(LongType())), ] return df.select(*select_col_list)
def test_as_spark_type_pandas_on_spark_dtype(self): type_mapper = { # binary np.character: (np.character, BinaryType()), np.bytes_: (np.bytes_, BinaryType()), np.string_: (np.bytes_, BinaryType()), bytes: (np.bytes_, BinaryType()), # integer np.int8: (np.int8, ByteType()), np.byte: (np.int8, ByteType()), np.int16: (np.int16, ShortType()), np.int32: (np.int32, IntegerType()), np.int64: (np.int64, LongType()), np.int: (np.int64, LongType()), int: (np.int64, LongType()), # floating np.float32: (np.float32, FloatType()), np.float: (np.float64, DoubleType()), np.float64: (np.float64, DoubleType()), float: (np.float64, DoubleType()), # string np.str: (np.unicode_, StringType()), np.unicode_: (np.unicode_, StringType()), str: (np.unicode_, StringType()), # bool np.bool: (np.bool, BooleanType()), bool: (np.bool, BooleanType()), # datetime np.datetime64: (np.datetime64, TimestampType()), datetime.datetime: (np.dtype("datetime64[ns]"), TimestampType()), # DateType datetime.date: (np.dtype("object"), DateType()), # DecimalType decimal.Decimal: (np.dtype("object"), DecimalType(38, 18)), # ArrayType np.ndarray: (np.dtype("object"), ArrayType(StringType())), # CategoricalDtype CategoricalDtype(categories=["a", "b", "c"]): ( CategoricalDtype(categories=["a", "b", "c"]), LongType(), ), } for numpy_or_python_type, (dtype, spark_type) in type_mapper.items(): self.assertEqual(as_spark_type(numpy_or_python_type), spark_type) self.assertEqual(pandas_on_spark_type(numpy_or_python_type), (dtype, spark_type)) if isinstance(numpy_or_python_type, CategoricalDtype): # Nested CategoricalDtype is not yet supported. continue self.assertEqual(as_spark_type(List[numpy_or_python_type]), ArrayType(spark_type)) self.assertEqual( pandas_on_spark_type(List[numpy_or_python_type]), (np.dtype("object"), ArrayType(spark_type)), ) # For NumPy typing, NumPy version should be 1.21+ and Python version should be 3.8+ if sys.version_info >= (3, 8) and LooseVersion( np.__version__) >= LooseVersion("1.21"): import numpy.typing as ntp self.assertEqual( as_spark_type(ntp.NDArray[numpy_or_python_type]), ArrayType(spark_type)) self.assertEqual( pandas_on_spark_type(ntp.NDArray[numpy_or_python_type]), (np.dtype("object"), ArrayType(spark_type)), ) with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."): as_spark_type(np.dtype("uint64")) with self.assertRaisesRegex(TypeError, "Type object was not understood."): as_spark_type(np.dtype("object")) with self.assertRaisesRegex(TypeError, "Type uint64 was not understood."): pandas_on_spark_type(np.dtype("uint64")) with self.assertRaisesRegex(TypeError, "Type object was not understood."): pandas_on_spark_type(np.dtype("object"))
# STRING COMPARISON
# (J is the string-metrics library imported elsewhere; by the function names
# it matches the jellyfish package.)
@udf(returnType=IntegerType())
def damerau_levenshtein_distance(s1, s2):
    return None if s1 is None or s2 is None else J.damerau_levenshtein_distance(s1, s2)


@udf(returnType=IntegerType())
def hamming_distance(s1, s2):
    return None if s1 is None or s2 is None else J.hamming_distance(s1, s2)


@udf(returnType=FloatType())
def jaro_similarity(s1, s2):
    return None if s1 is None or s2 is None else J.jaro_similarity(s1, s2)


@udf(returnType=FloatType())
def jaro_winkler_similarity(s1, s2):
    return None if s1 is None or s2 is None else J.jaro_winkler_similarity(s1, s2)


@udf(returnType=BooleanType())
def match_rating_comparison(s1, s2):
    return None if s1 is None or s2 is None else J.match_rating_comparison(s1, s2)
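# Usage sketch (an assumption, not part of the original source): applying the
# UDFs above to a small DataFrame of string pairs; the explicit None checks
# make the null row propagate null.
from pyspark.sql import SparkSession

_spark = SparkSession.builder.master("local[1]").appName("sketch").getOrCreate()
_pairs = _spark.createDataFrame(
    [("martha", "marhta"), ("jellyfish", "smellyfish"), (None, "x")],
    ["s1", "s2"])
_pairs.select(
    "s1", "s2",
    jaro_similarity("s1", "s2").alias("jaro"),
    match_rating_comparison("s1", "s2").alias("mrc")).show()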
from pyspark.sql.types import ( DoubleType, LongType, StringType, StructField, StructType, BooleanType, ) retention_schema = StructType([ StructField("client_id", StringType(), True), StructField("subsession_start", StringType(), True), StructField("profile_creation", StringType(), True), StructField("days_since_creation", LongType(), True), StructField("channel", StringType(), True), StructField("app_version", StringType(), True), StructField("geo", StringType(), True), StructField("distribution_id", StringType(), True), StructField("is_funnelcake", BooleanType(), True), StructField("source", StringType(), True), StructField("medium", StringType(), True), StructField("campaign", StringType(), True), StructField("content", StringType(), True), StructField("sync_usage", StringType(), True), StructField("is_active", BooleanType(), True), StructField("usage_hours", DoubleType(), True), StructField("sum_squared_usage_hours", DoubleType(), True), StructField("total_uri_count", LongType(), True), StructField("unique_domains_count", LongType(), True), ])
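# Usage sketch (an assumption, not part of the original source): applying
# retention_schema on read so the boolean and numeric fields are typed up
# front instead of inferred; the file path is hypothetical.
from pyspark.sql import SparkSession

_spark = SparkSession.builder.getOrCreate()
retention_df = _spark.read.csv("retention.csv", header=True, schema=retention_schema)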
cultures = ['japanese', 'american', 'african (general)'] # Uncomment the above for more robust and large scale searches! classes = cultures + mediums medium_set = set(mediums) culture_set = set(cultures) selected_ids = {"AK-RBK-17525-2", "AK-MAK-1204", "AK-RAK-2015-2-9"} small_df = df.where( udf( lambda medium, culture, id_val: (medium in medium_set) or (culture in culture_set) or (id_val in selected_ids), BooleanType())("Classification", "Culture", "id")) small_df.count() # COMMAND ---------- # MAGIC %md ### Define and fit ConditionalKNN models # MAGIC Below, we create ConditionalKNN models for both the medium and culture columns; each model takes in an output column, features column (feature vector), values column (cell values under the output column), and label column (the quality that the respective KNN is conditioned on). # COMMAND ---------- medium_cknn = ( ConditionalKNN().setOutputCol("Matches").setFeaturesCol("Norm_Features"). setValuesCol("Thumbnail_Url").setLabelCol("Classification").fit(small_df)) # COMMAND ----------
def transactions_feature_table(spark, client): schema = StructType([ StructField("customer_id", IntegerType()), StructField("event_timestamp", TimestampType()), StructField("created_timestamp", TimestampType()), StructField("total_transactions", DoubleType()), StructField("is_vip", BooleanType()), ]) df_data = [ ( 1001, datetime(year=2020, month=9, day=1, tzinfo=utc), datetime(year=2020, month=9, day=1, tzinfo=utc), 50.0, True, ), ( 1001, datetime(year=2020, month=9, day=1, tzinfo=utc), datetime(year=2020, month=9, day=2, tzinfo=utc), 100.0, True, ), ( 2001, datetime(year=2020, month=9, day=1, tzinfo=utc), datetime(year=2020, month=9, day=1, tzinfo=utc), 400.0, False, ), ( 1001, datetime(year=2020, month=9, day=2, tzinfo=utc), datetime(year=2020, month=9, day=1, tzinfo=utc), 200.0, False, ), ( 1001, datetime(year=2020, month=9, day=4, tzinfo=utc), datetime(year=2020, month=9, day=1, tzinfo=utc), 300.0, False, ), ] temp_dir, file_uri = create_temp_parquet_file(spark, "transactions", schema, df_data) file_source = FileSource( event_timestamp_column="event_timestamp", created_timestamp_column="created_timestamp", file_format=ParquetFormat(), file_url=file_uri, ) features = [ Feature("total_transactions", ValueType.DOUBLE), Feature("is_vip", ValueType.BOOL), ] feature_table = FeatureTable("transactions", ["customer_id"], features, batch_source=file_source) yield client.apply(feature_table) shutil.rmtree(temp_dir)
StructField( "build", ArrayType( StructType([StructField("application_name", StringType(), True)]), True), True), StructField( "settings", ArrayType(StructType([StructField("locale", StringType(), True)]), True), True), StructField( "active_addons", ArrayType( MapType( StringType(), StructType([ StructField("blocklisted", BooleanType(), True), StructField("type", StringType(), True), StructField("signed_state", LongType(), True), StructField("user_disabled", BooleanType(), True), StructField("app_disabled", BooleanType(), True), StructField("is_system", BooleanType(), True) ]), True), True)) ]) default_sample = { "client_id": "client-id", "normalized_channel": "release", "build": [{ "application_name": "Firefox"
def helper_method_process_data(total_data, spark_context, sql_context):
    """
    Process the raw input data

    This function parallelizes and filters the raw input data so it is ready
    for model building.

    Keyword arguments:
    total_data -- raw input data
    spark_context -- Spark context created in caller module
    sql_context -- SQL/HIVE Context created in caller module
    """

    def table_creator(row):
        '''
        This method will give minimal structure to the data as per requirement.
        '''
        temp_file_name = str(row['ci_job']['artifacts'].get('name', ''))
        return (Row(
            original_url=str(row['ci_job']['artifacts'].get('original_url', '')),
            result=str(row['ci_job'].get('result', '')),
            message=row.get('message', ''),
            file_name='XML' if temp_file_name.endswith('.xml') else temp_file_name,
        ))

    def datetime_substitutor(a):
        return re.sub(r'\d{4}-\d{2}-\d{2}\s+\d{2}:\d{2}:\d{2}', '', a)

    def space_remover(a):
        return re.sub(r'\s+', ' ', a)

    def caller_method(a):
        return space_remover(datetime_substitutor(a))

    print("===============One data point=============:")
    print("inside helper_method_process_data KEYS:", total_data[0].keys())
    print("After filtering out the data without _source, length of data:", len(total_data))
    total_df = spark_context.parallelize(total_data).map(table_creator)
    total_df = total_df.filter(lambda a: not a.original_url.endswith('xunit.xml'))
    total_df = sql_context.createDataFrame(data=total_df, samplingRatio=0.3)
    udf_caller_method = udf(caller_method)
    # msg_df_failed = total_df.rdd.filter(lambda a: a.result != 'SUCCESS').toDF(sampleRatio=0.3).select('message', 'original_url')
    msg_df_failed = total_df.select('message', 'original_url')
    msg_df_failed = msg_df_failed.withColumn(
        "cleaned_message", udf_caller_method(msg_df_failed.message))
    msg_df_failed.first()
    # regex = re.compile('[%s]' % re.escape(string.punctuation))
    color_code_words = ['[1;32m', '[0m']
    stopwords = ['a', 'about', 'above', 'across', 'after', 'afterwards']
    stopwords += ['again', 'against', 'all', 'almost', 'alone', 'along']
    stopwords += ['already', 'also', 'although', 'always', 'am', 'among']
    stopwords += ['amongst', 'amoungst', 'amount', 'an', 'and', 'another']
    stopwords += ['any', 'anyhow', 'anyone', 'anything', 'anyway', 'anywhere']
    stopwords += ['are', 'around', 'as', 'at', 'back', 'be', 'became']
    stopwords += ['because', 'become', 'becomes', 'becoming', 'been']
    stopwords += ['before', 'beforehand', 'behind', 'being', 'below']
    stopwords += ['beside', 'besides', 'between', 'beyond', 'bill', 'both']
    stopwords += ['bottom', 'but', 'by', 'call', 'can', 'cannot', 'cant']
    stopwords += ['co', 'computer', 'con', 'could', 'couldnt', 'cry', 'de']
    stopwords += ['describe', 'detail', 'did', 'do', 'done', 'down', 'due']
    stopwords += ['during', 'each', 'eg', 'eight', 'either', 'eleven', 'else']
    stopwords += ['elsewhere', 'empty', 'enough', 'etc', 'even', 'ever']
    stopwords += ['every', 'everyone', 'everything', 'everywhere', 'except']
    stopwords += ['false', 'few', 'fifteen', 'fifty', 'file', 'fill', 'find',
                  'fire', 'first']
    stopwords += ['five', 'for', 'former', 'formerly', 'forty', 'found']
    stopwords += ['four', 'from', 'front', 'full', 'further', 'get', 'give']
    stopwords += ['go', 'had', 'has', 'hasnt', 'have', 'he', 'hence', 'her']
    stopwords += ['here', 'hereafter', 'hereby', 'herein', 'hereupon', 'hers']
    stopwords += ['herself', 'him', 'himself', 'his', 'how', 'however']
    stopwords += ['hundred', 'i', 'ie', 'if', 'in', 'inc', 'indeed']
    stopwords += ['interest', 'into', 'is', 'it', 'its', 'itself', 'keep']
    stopwords += ['last', 'latter', 'latterly', 'least', 'less', 'line',
                  'ltd', 'made']
    stopwords += ['many', 'may', 'me', 'meanwhile', 'might', 'mill', 'mine']
    stopwords += ['more', 'moreover', 'most', 'mostly', 'move', 'much']
    stopwords += ['must', 'my', 'myself', 'name', 'namely', 'neither', 'never']
    stopwords += ['nevertheless', 'next', 'nine', 'no', 'nobody', 'none']
    stopwords += ['noone', 'nor', 'not', 'nothing', 'now', 'nowhere', 'of']
    stopwords += ['off', 'often', 'on', 'once', 'one', 'only', 'onto', 'or']
    stopwords += ['other', 'others', 'otherwise', 'our', 'ours', 'ourselves']
    stopwords += ['out', 'over', 'own', 'part', 'per', 'perhaps', 'please']
    stopwords += ['put', 'rather', 're', 's', 'same', 'see', 'seem', 'seemed']
    stopwords += ['seeming', 'seems', 'serious', 'several', 'she', 'should']
    stopwords += ['show', 'side', 'since', 'sincere', 'six', 'sixty', 'so']
    stopwords += ['some', 'somehow', 'someone', 'something', 'sometime']
    stopwords += ['sometimes', 'somewhere', 'still', 'such', 'system', 'take']
    stopwords += ['ten', 'than', 'that', 'the', 'their', 'them', 'themselves']
    stopwords += ['then', 'thence', 'there', 'thereafter', 'thereby']
    stopwords += ['therefore', 'therein', 'thereupon', 'these', 'they']
    stopwords += ['thick', 'thin', 'third', 'this', 'those', 'though', 'three']
    stopwords += ['three', 'through', 'throughout', 'thru', 'thus', 'to']
    stopwords += ['together', 'too', 'top', 'toward', 'towards', 'true', 'twelve']
    stopwords += ['twenty', 'two', 'un', 'under', 'until', 'up', 'upon']
    stopwords += ['us', 'very', 'via', 'was', 'we', 'well', 'were', 'what']
    stopwords += ['whatever', 'when', 'whence', 'whenever', 'where']
    stopwords += ['whereafter', 'whereas', 'whereby', 'wherein', 'whereupon']
    stopwords += ['wherever', 'whether', 'which', 'while', 'whither', 'who']
    stopwords += ['whoever', 'whole', 'whom', 'whose', 'why', 'will', 'with']
    stopwords += ['within', 'without', 'would', 'yet', 'you', 'your']
    stopwords += ['yours', 'yourself', 'yourselves']
    stopwords += color_code_words
    stopwords = set(stopwords)
    broadcast_var = spark_context.broadcast(stopwords)
    total_df.unpersist()
    del total_df

    ## Notes: Map Reduce was replaced by Aggregation of Spark Framework.
    temp_df = msg_df_failed.filter(
        udf(lambda x: x.strip() != '', BooleanType())(
            msg_df_failed.cleaned_message)).groupby('original_url').agg(
                func.concat_ws(
                    " ",
                    func.collect_list(
                        msg_df_failed.cleaned_message))).withColumnRenamed(
                            "original_url", "artifact_url").withColumnRenamed(
                                "concat_ws( , collect_list(cleaned_message))",
                                "concat_msg")

    print("=================== NLTK version Print=====================")
    print(nltk.__version__)

    def word_tokenize(x):
        return nltk.word_tokenize(x)

    df = (temp_df.rdd.map(lambda x: (
        x.artifact_url, word_tokenize(x.concat_msg))).toDF().withColumnRenamed(
            "_1", "artifact_url")).withColumnRenamed("_2", "features")
    temp_df.unpersist()
    msg_df_failed.unpersist()
    del temp_df
    del msg_df_failed
    # In Python 3, filter() returns a lazy iterator, so materialize with list()
    # before storing the tokens in a Row.
    df = df.rdd.map(lambda a: Row(
        artifact_url=a.artifact_url,
        features=list(filter(lambda y: len(y) > 3, [x.lower() for x in a.features]))
    )).map(lambda a: Row(
        artifact_url=a.artifact_url,
        features=list(filter(lambda x: x not in broadcast_var.value, a.features))
    )).toDF()
    return df