def df_erros_dia(_logs):
    return (_logs.filter(f.col('codigo_retorno_http') == 404).groupby(
        f.substring('timestamp', 1, 11).alias('timestamp')).agg(
            f.count(f.col('codigo_retorno_http')).alias('total_erros_dia')).orderBy(
                f.unix_timestamp(f.substring('timestamp', 1, 11), 'dd/MMM/yyyy'),
                ascending=False))
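# A minimal usage sketch for df_erros_dia, assuming a SparkSession named `spark`
# and the `import pyspark.sql.functions as f` alias used above. The sample rows
# are hypothetical; real data would come from a log parser such as df_logs_layout.
def _example_df_erros_dia(spark):
    logs = spark.createDataFrame(
        [('01/Jul/1995:00:00:01 -0400', 404),
         ('01/Jul/1995:00:00:09 -0400', 404),
         ('02/Jul/1995:00:00:11 -0400', 200)],
        ['timestamp', 'codigo_retorno_http'])
    # Expected output: one row per day with the count of 404 responses,
    # ordered from the most recent day to the oldest.
    df_erros_dia(logs).show()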
def main():
    args = parseArguments()
    spark = SparkSession.builder.getOrCreate()
    Logger = spark._jvm.org.apache.log4j.Logger
    joblogger = Logger.getLogger(__name__)
    joblogger.info('****************************************************************')
    joblogger.info('')
    joblogger.info('Starting creation of test data file with {0} rows and {1} '
                   'partitions at {2}'.format(args.rows, args.partitions, args.outfile))
    joblogger.info('')
    joblogger.info('****************************************************************')
    udfGetUUID = F.udf(getUUID, T.StringType())
    df = (spark.range(0, args.rows, numPartitions=args.partitions)
          .withColumn('value', udfGetUUID())
          .withColumn('prefix2', F.substring(F.col('value'), 1, 2))
          .withColumn('prefix4', F.substring(F.col('value'), 1, 4))
          .withColumn('prefix8', F.substring(F.col('value'), 1, 8))
          .withColumn('float_val', F.rand(seed=8675309) * 1000000)
          .withColumn('integer_val', F.col('float_val').cast(T.LongType()))
          .drop('id'))
    df.write.csv(args.outfile, mode='overwrite', header=True)
    joblogger.info('Done writing to {0}'.format(args.outfile))
def hashstr(self, method='crc32', salt=''):
    f = {
        'crc32': F.crc32,
        'md5': F.md5,
        'md5-8': F.md5,
        'md5-4': F.md5,
        'sha1': F.sha1
    }.get(method, F.crc32)
    df = self.df
    for c in self.scols:
        col = F.col(c).cast('string')
        h = f(F.concat(col, F.lit(salt)))
        if method == 'crc32':
            res = F.conv(h.cast('string'), 10, 16)
        elif method == 'md5-8':
            res = F.substring(h, 0, 16)
        elif method == 'md5-4':
            res = F.substring(h, 0, 8)
        else:
            res = h
        df = df.withColumn(c, res)
    return df
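# A standalone sketch of the column expression hashstr builds for method='md5-8',
# assuming a SparkSession named `spark` and `import pyspark.sql.functions as F`.
# The class holding self.df / self.scols is not shown; this only illustrates the
# salted-hash-then-truncate step for a single column.
def _example_hashstr_md5_8(spark, salt='pepper'):
    df = spark.createDataFrame([('alice',), ('bob',)], ['user'])
    h = F.md5(F.concat(F.col('user').cast('string'), F.lit(salt)))
    # 'md5-8' keeps only the first 16 hex characters (8 bytes) of the digest.
    return df.withColumn('user', F.substring(h, 0, 16))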
def extract_imits_tsv_allele_2(spark_session: SparkSession, file_path: str) -> DataFrame:
    imits_df = utils.extract_tsv(spark_session, file_path)
    imits_df = imits_df.withColumn(
        "allele_mgi_accession_id",
        when(
            (col("allele_mgi_accession_id").isNull()) & (col("type") == "Allele"),
            concat(lit("NOT-RELEASED-"), substring(md5(col("allele_symbol")), 0, 10)),
        ).otherwise(col("allele_mgi_accession_id")),
    )
    imits_df = imits_df.withColumn(
        "marker_mgi_accession_id",
        when(
            (col("marker_mgi_accession_id").isNull()) & (col("type") == "Gene"),
            concat(lit("NOT-RELEASED-"), substring(md5(col("marker_symbol")), 0, 10)),
        ).otherwise(col("marker_mgi_accession_id")),
    )
    imits_df = imits_df.withColumn(
        "allele2_id", monotonically_increasing_id().astype(StringType()))
    for col_name in ALLELE2_MULTIVALUED:
        imits_df = imits_df.withColumn(
            col_name,
            when(
                col(col_name).contains("|"),
                split(col_name, r"\|"),
            ).otherwise(array(col_name)),
        )
    return imits_df
def split_colstxt(df, cols):
    """Function that splits fixed-size columns and casts to corresponding data type

    :param df: Spark data frame containing the texts to split
    :param cols: column schemes with name, data type and length
    :return: new Spark data frame with the columns divided according to the 'cols' parameter
    """
    i = 1
    for c in cols:
        if c.get('Datatype') != 'timestamp':
            df = df.withColumn(
                c.get('name'),
                fspark.substring(df.data_column, i, c.get('lenght')).cast(c.get('Datatype')))
        else:
            df = df.withColumn(
                c.get('name'),
                fspark.to_timestamp(
                    fspark.substring(df.data_column, i, c.get('lenght')),
                    "dd/MM/yyyy HH:mm:ss"))
        i = i + c.get('lenght')
    for c in df.dtypes:
        if c[1] == "string":
            df = df.withColumn(c[0], fspark.trim(df[c[0]]))
    return df
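# A hypothetical column scheme for split_colstxt, assuming a DataFrame with a single
# fixed-width text column named `data_column` and the module-level alias
# `import pyspark.sql.functions as fspark`. The dict keys match the ones the function
# reads, including the 'lenght' spelling used by its callers.
EXAMPLE_COLS = [
    {'name': 'customer_id', 'Datatype': 'integer', 'lenght': 8},
    {'name': 'customer_name', 'Datatype': 'string', 'lenght': 20},
    {'name': 'created_at', 'Datatype': 'timestamp', 'lenght': 19},
]

def _example_split_colstxt(spark):
    # 8 chars of id, 20 chars of name (space padded), 19 chars of timestamp
    raw = spark.createDataFrame(
        [('00000042Jane Doe            01/02/2020 10:30:00',)],
        ['data_column'])
    return split_colstxt(raw, EXAMPLE_COLS)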
def format_output(df):
    df = df.withColumn(
        "uniqueKey",
        f.upper(
            f.concat(
                f.lit("RY"), f.substring(f.col('year'), 3, 2), f.lit("_"),
                f.col("channel"), f.lit("_"), f.col("division"), f.lit("_"),
                f.col("gender"), f.lit("_"), f.col("category")))) \
        .withColumn("channel", f.upper(f.col("channel"))) \
        .withColumn("year", f.concat(f.lit("RY"), f.substring(f.col('year'), 3, 2))) \
        .withColumn("week_1", f.concat(f.lit("W"), f.col("week")))

    output = df.orderBy("week").groupBy(
        'uniqueKey', 'division', 'gender', 'category', 'channel', 'year').agg(
            f.to_json(
                f.collect_list(f.create_map('week_1', 'netSales'))).alias('Net Sales'),
            f.to_json(
                f.collect_list(f.create_map('week_1', 'salesUnits'))).alias('Sales Units'))
    return output
def filter_df(spark, path):
    merged = spark.read.parquet(join(path, "train/merged_transformed/"))
    em = spark.read.parquet(join(path, "train/EM_transformed/"))

    merged_date = merged.withColumn(
        "year", F.substring(merged.articleUID, 1, 4).cast("int")).withColumn(
            "month", F.substring(merged.articleUID, 5, 2).cast("int"))
    merged_filtered = merged_date.filter(
        "year > 2014 OR (year = 2014 AND month >= 6)").drop("year", "month")

    em_date = em.withColumn(
        "year", F.substring(em.articleUID, 1, 4).cast("int")).withColumn(
            "month", F.substring(em.articleUID, 5, 2).cast("int"))
    em_filtered = em_date.filter(
        "year > 2014 OR (year = 2014 AND month >= 6)").drop("year", "month")

    merge_rand = merged_filtered.withColumn("rand", F.rand(seed=SEED))
    em_rand = em_filtered.withColumn("rand", F.rand(seed=SEED))

    merge_nambiguous = merge_rand.filter(~merge_rand.ambiguous)
    em_nambiguous = em_rand.filter(~em_rand.ambiguous)
    merge_ambiguous = merge_rand.filter(merge_rand.ambiguous)
    em_ambiguous = em_rand.filter(em_rand.ambiguous)

    return merge_nambiguous, em_nambiguous, merge_ambiguous, em_ambiguous
def df_logs_layout(_logs):
    return (_logs.withColumn(
        'host_requisicao',
        f.split(_logs['value'], ' ').getItem(0)).withColumn(
            'timestamp',
            f.concat(
                f.substring(f.split(_logs['value'], ' ').getItem(3), 2, 20),
                f.substring(f.split(_logs['value'], ' ').getItem(4), 1, 5))).withColumn(
                    'requisicao',
                    f.concat(
                        f.substring(f.split(_logs['value'], ' ').getItem(5), 2, 3),
                        f.split(_logs['value'], ' ').getItem(6),
                        f.substring(f.split(_logs['value'], ' ').getItem(7), 1, 8))).withColumn(
                            'codigo_retorno_http',
                            f.split(_logs['value'], ' ').getItem(8).cast(IntegerType())).withColumn(
                                'total_bytes',
                                f.split(_logs['value'], ' ').getItem(9).cast(IntegerType())).drop('value'))
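# A minimal sketch of the raw input df_logs_layout expects: one string column named
# `value` holding NASA/Apache-style access log lines that split cleanly on spaces.
# Assumes a SparkSession named `spark`; the sample line below is hypothetical.
def _example_df_logs_layout(spark):
    line = ('199.72.81.55 - - [01/Jul/1995:00:00:01 -0400] '
            '"GET /history/apollo/ HTTP/1.0" 200 6245')
    logs = spark.createDataFrame([(line,)], ['value'])
    # Produces host_requisicao, timestamp, requisicao, codigo_retorno_http and
    # total_bytes columns, then drops the raw `value` column.
    return df_logs_layout(logs)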
def main():
    try:
        with open("airly_param.json", "r") as file:
            param_dict = json.load(file)
            print(param_dict["broker"])
    except Exception as ex:
        print("Problem reading the parameter file: airly_param.json")
        print(str(ex))
        exit()

    v_broker = param_dict["broker"]
    v_s3_bucket = param_dict["s3_bucket"]
    # Checkpoint location for the streaming sink; the key name is assumed to be
    # present in airly_param.json alongside "broker" and "s3_bucket".
    v_ckpt_loc = param_dict["ckpt_loc"]

    spark = SparkSession.builder.appName("Structured").getOrCreate()

    raw = spark.readStream.format("kafka") \
        .option("kafka.bootstrap.servers", v_broker) \
        .option("startingOffsets", "earliest") \
        .option("subscribe", "sensor").load()

    # Schema of the incoming data
    schema = StructType() \
        .add("current", StructType()
             .add("fromDateTime", StringType())
             .add("indexes", ArrayType(StructType().add("advice", StringType()).add("color", StringType()).add("description", StringType()).add("name", StringType()).add("value", DoubleType())))
             .add("standards", ArrayType(StructType().add("averaging", StringType()).add("limit", StringType()).add("name", StringType()).add("percent", DoubleType()).add("pollutant", StringType())))
             .add("tillDateTime", StringType())
             .add("values", ArrayType(StructType().add("name", StringType()).add("value", DoubleType())))) \
        .add("forecast", ArrayType(StructType()
             .add("fromDateTime", StringType())
             .add("indexes", ArrayType(StructType().add("advice", StringType()).add("color", StringType()).add("description", StringType()).add("value", DoubleType())))
             .add("standards", ArrayType(StructType().add("averaging", StringType()).add("limit", StringType()).add("name", StringType()).add("percent", DoubleType()).add("pollutant", StringType())))
             .add("tillDateTime", StringType())
             .add("values", ArrayType(StructType().add("name", StringType()).add("value", StringType()))))) \
        .add("history", ArrayType(StructType()
             .add("fromDateTime", StringType())
             .add("indexes", ArrayType(StructType().add("advice", StringType()).add("color", StringType()).add("description", StringType()).add("value", DoubleType())))
             .add("standards", ArrayType(StructType().add("averaging", StringType()).add("limit", StringType()).add("name", StringType()).add("percent", DoubleType()).add("pollutant", StringType())))
             .add("tillDateTime", StringType())
             .add("values", ArrayType(StructType().add("name", StringType()).add("value", StringType())))))

    # Data frame with the parsed payload
    df = raw.select(
        f.col("key").cast("String").cast("Integer"),
        f.from_json(f.col("value").cast("string"), schema).alias("parsed_value"))

    stream = df.select(
        f.col("key").cast("integer"),
        f.concat(f.split(f.col('parsed_value.current.fromDateTime'), 'T')[0], f.lit(' '),
                 f.substring(f.split(f.col('parsed_value.current.fromDateTime'), 'T')[1], 0, 8)).alias("fromTime"),
        f.concat(f.split(f.col('parsed_value.current.tillDateTime'), 'T')[0], f.lit(' '),
                 f.substring(f.split(f.col('parsed_value.current.tillDateTime'), 'T')[1], 0, 8)).alias("untilTime"),
        f.explode("parsed_value.current.values").alias("value")
    ).select("key", "fromTime", "untilTime", "value.name", f.col("value.value").cast("float"))

    # Write out in micro-batches
    def write_s3(batch_df, batch_id):
        batch_df.write.format('json').mode("append") \
            .option("checkpointLocation", v_ckpt_loc) \
            .option("path", v_s3_bucket).save()

    stream.writeStream.foreachBatch(write_s3).start().awaitTermination()
def main(in_directory, out_directory):
    df = spark.read.csv(in_directory, schema=comments_schema, sep=' ') \
        .withColumn('filename', functions.input_file_name())
    path_to_hour = functions.udf(getFileName, returnType=types.StringType())
    df = df.select(df['language'], df['title'], df['number_of_requests'],
                   path_to_hour(df['filename']).alias('filename'))
    df = df.filter((df['language'] == 'en')
                   & (df['title'] != 'Main Page')
                   & (functions.substring(df['title'], 0, 8) != 'Special:'))

    grouped = df.groupBy('filename', 'title')
    count = grouped.agg(functions.sum(df['number_of_requests']).alias('count'))
    grouped2 = count.groupBy('filename')
    maxi = grouped2.agg(functions.max(count['count']))

    joined = count.join(maxi, on='filename')
    joined = joined.filter(joined['count'] == joined['max(count)']) \
        .select(joined['filename'], joined['title'], joined['count'])
    joined = joined.cache()

    sorted_ = joined.sort('filename', 'title')
    sorted_.write.csv(out_directory, mode='overwrite')
def map_line_columns(line_df: DataFrame):
    for field, value in Constants.LINE_TO_OBSERVATION_MAP.items():
        if value is not None:
            line_df = line_df.withColumn(field, col(value))
        else:
            line_df = line_df.withColumn(field, lit(None))
    line_df = line_df.withColumn("biological_sample_group", lit("experimental"))
    line_df = line_df.withColumn("zygosity", lit("homozygote"))
    line_df = line_df.withColumn(
        "datasource_name",
        when(col("_dataSource") == "impc", lit("IMPC")).otherwise(
            when(col("_dataSource") == "europhenome",
                 lit("EuroPhenome")).otherwise(col("_dataSource"))),
    )
    line_df = line_df.withColumn(
        "allele_accession_id",
        when(col("biological_sample_group") == "control", lit(None)).otherwise(
            when(
                col("allele.mgiAlleleID").isNull(),
                concat(
                    lit("NOT-RELEASED-"),
                    substring(md5(line_df["allele_symbol"]), 0, 10),
                ),
            ).otherwise(col("allele.mgiAlleleID"))),
    )
    return line_df
def get_column_spec(self, source_df: Optional[DataFrame],
                    current_column: Optional[Column]) -> Column:
    column_spec = substring(
        self.column.get_column_spec(source_df=source_df,
                                    current_column=current_column),
        self.start, self.length)
    return column_spec
def _clean_airports(self):
    """
    Class method to clean airports data

    Operations:
        - select only US airports
        - select only airports where type is in ("large_airport", "medium_airport", "small_airport")
        - isolate region substring from iso_region field
        - cast elevation in feet to float

    Returns:
        [dict] - object with source-name: SparkDF key-value pairs
    """
    df = self.data_dict.get('airports', None)
    if df is not None:
        data = df \
            .where(
                (col("iso_country") == "US")
                & (col("type").isin("large_airport", "medium_airport", "small_airport"))
            ) \
            .withColumn("iso_region", substring(col("iso_region"), 4, 2)) \
            .withColumn("elevation_ft", col("elevation_ft").cast("float")) \
            .withColumnRenamed("local_code", "port_code")
        return dict(airports=data)
    else:
        logger.error(
            ValueError('No dataset named "airports" found in sources dict.'))
        raise ValueError('No dataset named "airports" found in sources dict.')
def repartitionData(self, data):
    """
    Perform table repartitioning in spark depending on settings in config
    :param data:
    :return:
    """
    print(
        f"{self.__class__.__name__} received microdata dataframe with number of partitions: {data.rdd.getNumPartitions()}"
    )
    print(f"& schema: {data.printSchema()}")
    print(
        f"{self.__class__.__name__} is repartitioning microdata dataframe to {self.reader.num_reader_partitions} partitions, "
    )
    print("collapsing to a sparse histogram representation, and passing back to driver...")
    if self.reader.num_reader_partitions > 0:
        data = data.withColumn(
            self.PARTITION_CODE_COLUMN,
            substring(self.geography_variables[0], 0, self.reader.reader_partition_len))
        if not self.reader.range_partition:
            print(f"Using df hash partitioner by {self.PARTITION_CODE_COLUMN}")
            return data.repartition(self.reader.num_reader_partitions,
                                    self.PARTITION_CODE_COLUMN).drop(self.PARTITION_CODE_COLUMN)
        print(f"Using df range partitioner by {self.PARTITION_CODE_COLUMN}")
        return data.repartitionByRange(self.reader.num_reader_partitions,
                                       self.PARTITION_CODE_COLUMN).drop(self.PARTITION_CODE_COLUMN)
    return data
def do(self, workflow, etl_process):
    from pyspark.sql.functions import substring, substring_index, split, col

    self.new_column = self.action_details.pop("name")
    self.target = self.action_details.pop("target")
    self.type = self.action_details.pop("type", "simple")
    if self.type == "simple":
        self.pos = self.action_details.pop("pos", 1)
        self.len = self.action_details.pop("len")
        workflow.df = workflow.df \
            .withColumn(self.new_column, substring(col(self.target), self.pos, self.len))
    else:
        self.delim = self.action_details.pop("delim")
        self.index = self.action_details.pop("index", 1)
        if self.type == "delim":
            workflow.df = workflow.df \
                .withColumn(self.new_column,
                            substring_index(col(self.target), self.delim, self.index))
        elif self.type == "delim_index":
            workflow.df = workflow.df \
                .withColumn(self.new_column,
                            split(self.target, self.delim).getItem(self.index - 1))
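# Hypothetical `action_details` payloads for the three modes handled above; the
# surrounding workflow/ETL classes are not shown, these dicts only illustrate the
# keys the action pops and the column names are made up for the example.
EXAMPLE_SIMPLE = {"name": "area_code", "target": "phone", "type": "simple", "pos": 1, "len": 3}
# substring_index with index=-1 keeps everything after the last delimiter
EXAMPLE_DELIM = {"name": "domain", "target": "email", "type": "delim", "delim": "@", "index": -1}
# split() takes a regex, so a literal dot must be escaped; index=2 picks the second token
EXAMPLE_DELIM_INDEX = {"name": "top_level", "target": "domain", "type": "delim_index", "delim": r"\.", "index": 2}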
def fill_days(year: str, month: str, day: str) -> None:
    S3_FULL_PATH = f"{S3_PATH}/{year}/{month}/{day}"
    try:
        # read all parquet files from a specific yyyy/mm/dd
        df = spark.read.format("parquet").load(f"{S3_FULL_PATH}/*.parquet")
        # create the extracted_at column from the date path
        df = df.withColumn(
            "extracted_at",
            from_unixtime(  # convert unixtime to timestamp
                unix_timestamp(
                    substring(
                        regexp_replace(input_file_name(), f"{S3_PATH}/", ""), 1, 10
                    ),
                    "yyyy/MM/dd",
                )
            ),
        )
        # overwrite dataframe with the new column
        df.write.mode("overwrite").parquet(f"{S3_FULL_PATH}/")
        print(f"files updated from {S3_FULL_PATH}/")
    # some dates don't have paths.
    except AnalysisException as err:
        print(err)
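# A small sketch of the extracted_at derivation above, applied to a literal path
# instead of input_file_name(); the base path and sample key are hypothetical and
# the pyspark.sql.functions imports are assumed to match the ones used by fill_days.
def _example_extracted_at(spark):
    S3_PATH = "s3a://my-bucket/raw"  # hypothetical base path
    sample = spark.createDataFrame(
        [(f"{S3_PATH}/2021/03/15/part-0000.parquet",)], ["path"])
    # strip the base path, keep the leading "yyyy/mm/dd" and parse it as a date
    return sample.withColumn(
        "extracted_at",
        from_unixtime(
            unix_timestamp(
                substring(regexp_replace("path", f"{S3_PATH}/", ""), 1, 10),
                "yyyy/MM/dd")))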
def billing_events(df):
    import datetime

    MAX_MONTH = 72

    def get_last_month(col):
        h = F.abs(F.xxhash64(col))
        h1 = (h.bitwiseAND(0xff)) % (MAX_MONTH // 2)
        h2 = (F.shiftRight(h, 8).bitwiseAND(0xff)) % (MAX_MONTH // 3)
        h3 = (F.shiftRight(h, 16).bitwiseAND(0xff)) % (MAX_MONTH // 5)
        h4 = (F.shiftRight(h, 24).bitwiseAND(0xff)) % (MAX_MONTH // 7)
        h5 = (F.shiftRight(h, 32).bitwiseAND(0xff)) % (MAX_MONTH // 11)
        return -(h1 + h2 + h3 + h4 + h5)

    w = pyspark.sql.Window.orderBy(F.lit("")).partitionBy(df.customerID)

    charges = (df.select(
        df.customerID,
        F.lit("Charge").alias("kind"),
        F.explode(
            F.array_repeat(
                (df.TotalCharges / df.tenure).cast(get_currency_type()),
                df.tenure.cast("int"))).alias("value"),
        F.when(df.Churn == "Yes",
               get_last_month(df.customerID)).otherwise(0).alias("last_month")).withColumn(
                   "now", F.lit(now).cast("date")).withColumn(
                       "month_number",
                       -(F.row_number().over(w) + F.col("last_month"))).withColumn(
                           "date", F.expr("add_months(now, month_number)")).drop(
                               "now", "month_number", "last_month"))

    serviceStarts = (df.withColumn(
        "last_month",
        F.when(df.Churn == "Yes", get_last_month(df.customerID)).otherwise(0)).select(
            df.customerID,
            F.lit("AccountCreation").alias("kind"),
            F.lit(0.0).cast(get_currency_type()).alias("value"),
            F.lit(now).alias("now"),
            (-df.tenure - 1 + F.col("last_month")).alias("month_number"),
        ).withColumn("date", F.expr("add_months(now, month_number)")).drop(
            "now", "month_number"))

    serviceTerminations = df.withColumn(
        "last_month",
        F.when(df.Churn == "Yes", get_last_month(df.customerID)).otherwise(0)).where(
            df.Churn == "Yes").withColumn("now", F.lit(now)).select(
                df.customerID,
                F.lit("AccountTermination").alias("kind"),
                F.lit(0.0).cast(get_currency_type()).alias("value"),
                F.expr("add_months(now, last_month)").alias("date"))

    billingEvents = charges.union(serviceStarts).union(
        serviceTerminations).orderBy("date").withColumn(
            "month", F.substring("date", 0, 7))

    return billingEvents
def extract_imits_tsv_by_entity_type(spark_session: SparkSession, file_path: str,
                                     entity_type: str) -> DataFrame:
    """
    Uses a Spark Session to generate a DataFrame from a TSV file and a specific entity type.
    Can extract Genes or Alleles from an Alleles report file produced by IMITS.
    :param spark_session: spark SQL session to be used in the extraction
    :param file_path: path to the TSV file
    :param entity_type: 'Allele' or 'Gene'
    :return: Spark DataFrame with the extracted data
    """
    imits_df = utils.extract_tsv(spark_session, file_path)
    imits_entity_df = imits_df.where(imits_df.type == entity_type)
    if entity_type == "Allele":
        imits_entity_df = imits_entity_df.withColumn(
            "acc",
            when(
                col("allele_mgi_accession_id").isNull(),
                concat(lit("NOT-RELEASED-"), substring(md5(col("allele_symbol")), 0, 10)),
            ).otherwise(col("allele_mgi_accession_id")),
        )
    imits_entity_df = imits_entity_df.withColumn(
        "allele2_id", monotonically_increasing_id().astype(StringType()))
    for col_name in ALLELE2_MULTIVALUED:
        imits_entity_df = imits_entity_df.withColumn(
            col_name,
            when(
                col(col_name).contains("|"),
                split(col_name, r"\|"),
            ).otherwise(array(col_name)),
        )
    return imits_entity_df
def read_and_process_airport_data(spark, filename, df_dimension_state_table):
    """ Load the airport codes and join with state dimension data to get airports with state_key"""
    logging.info("Reading airport data")

    # load the airport codes so we can map them to states
    airport_schema = R([
        Fld("ident", Str()),
        Fld("type", Str()),
        Fld("name", Str()),
        Fld("elevation_ft", Int()),
        Fld("continent", Str()),
        Fld("iso_country", Str()),
        Fld("iso_region", Str()),
        Fld("municipality", Str()),
        Fld("gps_code", Str()),
        Fld("iata_code", Str()),
        Fld("local_code", Str()),
        Fld("coordinates", Str())
    ])
    df_airport = spark.read.options(header=True, delimiter=",").csv(filename, airport_schema)

    # cleanse: we only want the airports in the US which map to the states that we have in the states table
    df_airport = df_airport.filter(df_airport.iso_country == "US") \
        .join(df_dimension_state_table,
              F.substring(df_airport.iso_region, 4, 2) == df_dimension_state_table.state_key,
              "inner") \
        .select(df_airport.ident, df_airport.local_code, df_dimension_state_table.state_key)

    return df_airport
def read_parquet(self, inputfilepath):
    out_df = self.spark.read.parquet('file:///' + inputfilepath + '/' + 'out_parquet')
    ldf2 = out_df. \
        orderBy(out_df['ScreenTemperature'].desc()). \
        limit(1). \
        select(out_df['ForecastSiteCode'],
               substring(out_df['ObservationDate'], 1, 10).alias('Date'),
               out_df['Region'],
               out_df['ScreenTemperature']). \
        show(10, truncate=False)
def erro_404_dia(df):
    df1 = (df.filter(df['retorno_http'] == '404'))
    df2 = (df1.select(
        f.substring(df1['date'], 2, 11).alias('date'),
        df1['retorno_http']).groupBy('date').agg(
            f.count('retorno_http').alias('tot_erro_404')).sort(
                f.col('date').asc()))
    return df2
def extract_hes_year(hes_type_regex):
    table_yyYY_col = F.regexp_extract(F.col('tableName'),
                                      "(?<=" + hes_type_regex + r"_{0,1})\d{4}", 0)
    table_yy_col = F.substring(table_yyYY_col, 0, 2)
    table_year_col = (F.when(table_yy_col > 80, F.concat(F.lit('19'), table_yy_col))
                      .otherwise(F.concat(F.lit('20'), table_yy_col).cast('integer')))
    return table_year_col
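# A quick sketch of the two-digit-year pivot above, assuming HES table names that end
# in a four-digit financial-year code such as 'hes_apc_9798' or 'hes_apc_1415' (the
# naming is an assumption); yy values above 80 are read as 19yy, otherwise as 20yy,
# so these two rows would yield 1997 and 2014.
def _example_extract_hes_year(spark):
    df = spark.createDataFrame(
        [("hes_apc_9798",), ("hes_apc_1415",)], ["tableName"])
    return df.withColumn("year", extract_hes_year("hes_apc"))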
def make_wikidata_movies():
    wd = spark.read.parquet(sys.argv[1])

    # just movies
    wd = wd.where(wd['imdb_id'].isNotNull()).where(
        functions.substring(wd['imdb_id'].getItem(0), 0, 2) == 'tt')

    # ... with reasonable depth of data
    wd = wd.where(wd['genre'].isNotNull())
    wd = wd.where(wd['enwiki_title'].isNotNull())
    wd = wd.where(wd['rotten_tomatoes_id'].isNotNull())

    # calculate made_profit boolean
    wd = wd.withColumn(
        'nbox', wd.box_office.getItem(0).getItem(0).cast(types.FloatType()))
    wd = wd.withColumn('ncost', wd.cost.getItem(0).getItem(0).cast(types.FloatType()))
    wd = wd.withColumn('made_profit', (wd.nbox - wd.ncost) > 0)
    wd = wd.withColumn('profit', wd.nbox - wd.ncost)

    # actual fields we want to output: other fields could be included and/or modified here.
    output_data = wd.select(
        wd['id'].alias('wikidata_id'),
        wd['label'],
        take_first(wd, 'imdb_id'),
        take_first(wd, 'rotten_tomatoes_id'),
        take_first(wd, 'metacritic_id'),
        wd['enwiki_title'],
        wd['genre'],
        wd['main_subject'],
        wd['filming_location'],
        wd['director'],
        #wd['screenwriter'],
        wd['cast_member'],
        #wd['narrative_location'],
        #wd['director_of_photography'],
        #wd['film_editor'],
        #wd['filming_location'],
        take_first(wd, 'series'),
        #wd['voice_actor'],
        #wd['executive_producer'],
        #wd['composer'],
        #take_first(wd, 'production_company'),
        #take_first(wd, 'distributor'),
        first_publication_date(wd['publication_date']).alias('publication_date'),
        take_first(wd, 'based_on'),
        take_first(wd, 'country_of_origin'),
        take_first(wd, 'original_language'),
        wd['made_profit'],
        wd['profit'],
        wd['nbox'],
        wd['ncost'])

    # output is about 4MB compressed: safe to .coalesce().
    output_data.coalesce(1).write.json('./wikidata-movies',
                                       mode='overwrite',
                                       compression='gzip')
def mturkify(
    spark: SparkSession,
    *,
    quotes_context_path: str,
    quootstrap_path: str,
    speakers_path: str,
    output_path: str,
    nb_partition: int,
    compression: str = "gzip",
):
    qc = spark.read.json(quotes_context_path)
    qc_date = qc.withColumn(
        "year", F.substring(qc.articleUID, 1, 4).cast("int")).withColumn(
            "month", F.substring(qc.articleUID, 5, 2).cast("int"))
    qc_filtered = qc_date.filter("year > 2014 OR (year = 2014 AND month >= 6)")
    qc_key = qc_filtered.withColumn(
        "key", F.substring(qc_filtered.articleUID, 1, 6).cast("int")).drop("year", "month")

    quootstrap_df = spark.read.json(quootstrap_path)
    q2 = quootstrap_df.select(F.explode("occurrences").alias("occurrence"))
    fields_to_keep = [
        q2.occurrence.articleUID.alias("articleUID"),
        q2.occurrence.articleOffset.alias("articleOffset"),
    ]
    attributed_quotes_df = q2.select(*fields_to_keep).withColumn(
        "in_quootstrap", F.lit(True))

    speakers = spark.read.json(speakers_path)
    joined = qc_key.join(speakers, on="articleUID")
    transformed = (joined.rdd.map(transform).filter(
        lambda x: x is not None).toDF().withColumn("nb_entities", F.size("entities")))
    transformed_quootstrap = transformed.join(
        attributed_quotes_df, on=["articleUID", "articleOffset"],
        how="left").na.fill(False)

    transformed_quootstrap.write.parquet(output_path, "overwrite", compression=compression)
def loadDiscovery(self):
    self.log.info('Exception Handling starts')

    validSourceFormat = self.isValidFormatInSource()
    if not validSourceFormat:
        self.log.error("StoreDailyGoalForecast Source files not in csv format.")

    validSourceSchema = self.isValidSchemaInSource()
    if not validSourceSchema:
        self.log.error(
            "StoreDailyGoalForecast Source schema does not have all the required columns.")

    if not validSourceFormat or not validSourceSchema:
        self.log.info("Copy the source files to data processing error path and return.")
        self.copyFile(
            self.storeDailyGoalForecastFile,
            self.dataProcessingErrorPath + self.tableName +
            datetime.now().strftime('%Y%m%d%H%M') + self.fileFormat)
        return

    self.log.info('Source format and schema validation successful.')
    self.log.info('Reading the input parquet file')

    dfStoreDailyGoalForecast = self.sparkSession.read.format("com.databricks.spark.csv"). \
        option("encoding", "UTF-8"). \
        option("ignoreLeadingWhiteSpace", "true"). \
        option("ignoreTrailingWhiteSpace", "true"). \
        option("header", "true"). \
        option("treatEmptyValuesAsNulls", "true"). \
        option("inferSchema", "true"). \
        option("escape", '"'). \
        option("quote", "\""). \
        option("multiLine", "true"). \
        load(self.storeDailyGoalForecastPath).toDF(*self.storeDailyGoalForecastCols)

    # dfStoreDailyGoalForecast.withColumnRenamed("Date", "date"). \
    #     withColumnRenamed("Day % to Forecast", "daypercentforecast"). \
    #     withColumnRenamed("Daily Forecast", "dailyforecast").registerTempTable("StoreDailyGoalForecast")
    # dfStoreDailyGoalForecastFinal = self.sparkSession.sql("select date, daypercentforecast, dailyforecast from "
    #                                                       "StoreDailyGoalForecast")

    dfStoreDailyGoalForecast.coalesce(1).write.mode('overwrite').format('parquet'). \
        save(self.storeDailyGoalForecastWorkingPath)

    dfStoreDailyGoalForecast.coalesce(1).withColumn("year", year(from_unixtime(unix_timestamp()))). \
        withColumn("month", substring(from_unixtime(unix_timestamp()), 6, 2)). \
        write.mode('append').partitionBy('year', 'month').format('parquet').save(
            self.storeDailyGoalForecastPartitionPath)

    self.sparkSession.stop()
def prepare_data(spark, sample_size):
    """
    Clean up raw data and split the data into train and test
    :param spark: SparkSession
    :param sample_size: int, number of rows to select
    :return : train, test, both of them are spark dataframes
    """
    movies = spark.read.load('data/ml-20m/movies.csv', format='csv', sep=',',
                             inferSchema='true', header='true')
    ratings = spark.read.load('data/ml-20m/ratings.csv', format='csv', sep=',',
                              inferSchema='true', header='true')

    # take a subset of the database (can't do ordering here as it is really slow
    # over a distributed database)
    n_ratings = ratings.limit(sample_size)

    # remove movies and users with only 1 rating
    user_filter = (n_ratings.groupBy('userId').agg(
        count('userId').alias('count')).filter(col('count') == 1).select('userId'))
    movie_filter = (n_ratings.groupBy('movieId').agg(
        count('movieId').alias('count')).filter(col('count') == 1).select('movieId'))
    n_ratings = n_ratings.join(user_filter, ['userId'], how='left_anti')
    n_ratings = n_ratings.join(movie_filter, ['movieId'], how='left_anti')

    # movies with valid genre
    movies_genre = movies.filter(col('genres') != '(no genres listed)')

    # extract the release year from the end of the title, e.g. "Toy Story (1995)"
    movies_genre = movies_genre.withColumn('year', substring(col('title'), -5, 4))
    genre_filter = movies_genre.select('movieId')

    # keep only movies with genre
    n_ratings = n_ratings.join(genre_filter, ['movieId'], how='left_semi')

    # test train split
    train, test = n_ratings.randomSplit([0.8, 0.2], seed=12345)

    # take union set of users, movies in both data pieces
    train = train.join(test.select('userId'), ['userId'], how='left_semi')
    train = train.join(test.select('movieId'), ['movieId'], how='left_semi')
    test = test.join(train.select('userId'), ['userId'], how='left_semi')
    test = test.join(train.select('movieId'), ['movieId'], how='left_semi')

    return train, test
def predict(args):
    flight_features = args["feature"].split(",")
    features = spark.createDataFrame(
        [(flight_features[0], flight_features[1], flight_features[2],
          flight_features[3], float(flight_features[4]), float(flight_features[5]))],
        schema=feature_schema)
    features = features.withColumn(
        'CRS_DEP_HOUR', substring(convert_time_to_hour("CRS_DEP_TIME"), 0, 2))
    result = model.transform(features).collect()[0].prediction
    return {"result": result}
def createDataFrameHTMLPreview(dataframe):
    from pyspark.sql.functions import col, substring

    # remove the `_id` column, then cast every remaining column to a string
    # truncated to 100 characters
    removeid = filter(lambda x: x != "_id", dataframe.columns)
    limitto100 = list(map(lambda x: substring(col(x).cast("String"), 0, 100).alias(x), removeid))
    pandas = dataframe.limit(displaylimit).select(limitto100).toPandas()
    html = pandas.to_html()
    return html
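# A minimal usage sketch, assuming a SparkSession named `spark` and a module-level
# `displaylimit` (e.g. 10) as used inside createDataFrameHTMLPreview; the sample
# columns are hypothetical.
def _example_preview(spark):
    df = spark.createDataFrame(
        [("507f1f77bcf86cd799439011", "alice", 42)], ["_id", "name", "score"])
    # `_id` is dropped and the remaining columns are rendered as an HTML table.
    return createDataFrameHTMLPreview(df)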
async def run(
    self,
    spark,
    filter='2014*',
    start='2014-01-01',
    end='2015-01-01',
    mode='append',
    **inputs,
):
    print('bitmex trades csv->parquet')

    df = spark.read \
        .csv(
            f's3a://stackpoint-spark/data/bitmex/trades/{filter}.csv.gz',
            header=True,
            schema=BitmexTrades,
        )
    df = df.filter(df.timestamp >= start) \
        .filter(df.timestamp < end)
    df = df.drop('grossValue')
    df = df.drop('homeNotional')
    df = df.drop('foreignNotional')

    # cut milli/nanoseconds and convert to timestamp
    conv_timestamp = F.to_timestamp(F.substring(df.timestamp, 0, 23), TIMESTAMP)
    df = df.withColumn("timestamp", conv_timestamp)
    df = df.withColumn("time", F.unix_timestamp(df.timestamp))
    df = df.withColumn("date", F.from_unixtime(df.time, "yyyyMMdd"))
    df = df.drop('timestamp')

    df.limit(5).show()
    count = df.count()
    print(f'{count} trades')
    volume = df.agg(F.sum("size")).collect()[0][0]
    print(f'total volume: ${volume}')

    print('writing parquet')
    df.write \
        .mode(mode) \
        .partitionBy(['date']) \
        .parquet('s3a://stackpoint-spark/data/bitmex/parquet/trades')
    print('done writing')

    return {
        'trades': count,
        'volume': volume,
    }
def apply_basic_aggregations(df):
    """
    Take all transactions and apply some basic aggregations at the harp_property_id_no grain
    :param df:
    :return:
    """
    # Where for a given harp ID, hotel chain, lat/long, hotel_name has changed over the course
    # of the analysis window, simply take the most recent hotel_name
    df_current_hotel_names = \
        df \
        .groupBy('harp_property_id_no',
                 'hotel_name',
                 'hotel_chain_name',
                 'city_code',
                 'x',
                 'y',
                 ) \
        .agg(F.max('created_date').alias('max_created_date')) \
        .withColumn('hotel_name_rank',
                    F.dense_rank().over(Window
                                        .partitionBy(F.col('harp_property_id_no'))
                                        .orderBy(F.col('max_created_date').desc()))) \
        .filter('hotel_name_rank = 1') \
        .drop('hotel_name_rank', 'max_created_date')

    # Replace hotel names with most current iteration and aggregate down to the individual property grain
    return \
        df \
        .drop('hotel_name') \
        .join(df_current_hotel_names, on=['harp_property_id_no',
                                          'hotel_chain_name',
                                          'city_code',
                                          'x',
                                          'y',
                                          ]) \
        .filter('star_rating is not null AND length(star_rating) > 1') \
        .withColumn('star_rating_number', F.substring('star_rating', 0, 1).cast('integer')) \
        .groupBy('harp_property_id_no',
                 'hotel_name',
                 'hotel_chain_name',
                 'city_code',
                 'x',
                 'y',
                 ) \
        .agg(F.count(F.lit(1)).alias('num_bookings'),
             F.max('star_rating').alias('star_rating'),
             F.max('star_rating_number').alias('star_rating_number'),
             F.avg('yield').alias('avg_yield'),
             F.avg('adr').alias('avg_adr'),
             )