def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces."""
    no_punct = regexp_replace(column, "\p{Punct}", '')
    lowered = lower(no_punct)
    cleaned = trim(lowered)
    return cleaned
def removePunctuation(column): """Removes punctuation, changes to lower case, and strips leading and trailing spaces. Note: Only spaces, letters, and numbers should be retained. Other characters should should be eliminated (e.g. it's becomes its). Leading and trailing spaces should be removed after punctuation is removed. Args: column (Column): A Column containing a sentence. Returns: Column: A Column named 'sentence' with clean-up operations applied. """ #column_val = regexp_replace(column, "\p{Punct}", "") #return trim(lower(column_val)) word = lower(trim(regexp_replace(regexp_replace(column, '[^\w\s]', ''),'_',''))).alias("word") return word
def removePunctuation(column): """Removes punctuation, changes to lower case, and strips leading and trailing spaces. Note: Only spaces, letters, and numbers should be retained. Other characters should should be eliminated (e.g. it's becomes its). Leading and trailing spaces should be removed after punctuation is removed. Args: column (Column): A Column containing a sentence. Returns: Column: A Column named 'sentence' with clean-up operations applied. """ return (trim(regexp_replace(lower(column),'[^a-zA-Z0-9 ]','')).alias('sentence'))
def removePunctuation(column): """Removes punctuation, changes to lower case, and strips leading and trailing spaces. Note: Only spaces, letters, and numbers should be retained. Other characters should should be eliminated (e.g. it's becomes its). Leading and trailing spaces should be removed after punctuation is removed. Args: column (Column): A Column containing a sentence. Returns: Column: A Column named 'sentence' with clean-up operations applied. """ # assert(isinstance(column, pyspark.sql.column.Column)) assert(str(type(column)) == "<class 'pyspark.sql.column.Column'>") columnNoPunct = regexp_replace(column, "[^a-zA-Z0-9 ]", "") # columnNoPunct = regexp_replace(column, string.punctuation, "") columnLowerCase = lower(columnNoPunct) columnTrimmed = trim(columnLowerCase) return columnTrimmed
print(f"How many TX records did we get?") tx_cnt = texas_df.count() print(f"We got: %i " % tx_cnt) #Rename our LoanRange column to an estimated loan amount to match the existing sub 150k loan data. filtered_df = texas_df.select( col("LoanRange").alias("LoanAmount"), "City", "State", "Zip", "BusinessType", "NonProfit", "JobsRetained", "DateApproved", "Lender") #Doing some regular expressions to replace the text values with the average dollar amount and turning the column into a double type value_df = filtered_df.select("City", "State", "Zip", "BusinessType", "NonProfit", "JobsRetained", "DateApproved", "Lender") value_df = filtered_df.withColumn( "LoanAmount", regexp_replace(col("LoanAmount"), "[a-z] \$5-10 million", "7500000").cast("double")) value_df = value_df.withColumn( 'LoanAmount', regexp_replace(col("LoanAmount"), "[a-z] \$1-2 million", "1500000").cast("double")) value_df = value_df.withColumn( 'LoanAmount', regexp_replace(col("LoanAmount"), "[a-z] \$5-10 million", "7500000").cast("double")) value_df = value_df.withColumn( 'LoanAmount', regexp_replace(col("LoanAmount"), "[a-z] \$2-5 million", "3500000").cast("double")) value_df = value_df.withColumn( 'LoanAmount', regexp_replace(col("LoanAmount"), "[a-z] \$350,000-1 million",
new_viagens_df = viagens_df.select(
    col('Identificador do processo de viagem').alias('identificador_do_processo_de_viagem'),
    col('Situação').alias('situacao'),
    col('Código do órgão superior').alias('codigo_do_orgao_superior'),
    col('Nome do órgão superior').alias('nome_do_orgao_superior'),
    col('Código órgão solicitante').alias('codigo_orgao_solicitante'),
    col('Nome órgão solicitante').alias('nome_orgao_solicitante'),
    col('CPF viajante').alias('cpf_viajante'),
    col('Nome').alias('nome'),
    col('Cargo').alias('cargo'),
    col('Período - Data de início').alias('periodo_data_de_inicio'),
    col('Período - Data de fim').alias('periodo_data_de_fim'),
    col('Destinos').alias('destinos'),
    col('Motivo').alias('motivo'),
    regexp_replace(col('Valor diárias'), ",", "").cast("decimal").alias("valor_diarias"),
    regexp_replace(col('Valor passagens'), ",", "").cast("decimal").alias("valor_passagens"),
    regexp_replace(col('Valor outros gastos'), ",", "").cast("decimal").alias("valor_outros_gastos")).cache()

# COMMAND ----------

# exercise 1
new_viagens_df.write.mode('overwrite').parquet(output_path + "viagens_parquet")

# COMMAND ----------

# exercise 2
viagens_df.coalesce(1).write.mode('overwrite')\
# fill null values in Headline with ''
df_news = df_news.fillna({'Headline': ''})

# parse the timestamp in order to make time windows
df_news = df_news.withColumn('PublishDate1', F.to_date('PublishDate', "yyyy-MM-dd HH:mm:ss"))
df_timestamped = df_news.select(['PublishDate1', 'Topic', 'Title', 'Headline'])

# drop duplicates
# df_timestamped = df_timestamped.dropDuplicates(['Title', 'Headline'])

# remove punctuation from text data, text to lower case and trim whitespaces
df_timestamped = df_timestamped.withColumn(
    'Title', F.trim(F.lower(F.regexp_replace(F.col('Title'), '[^\sa-zA-Z0-9]', ''))))
df_timestamped = df_timestamped.withColumn(
    'Headline', F.trim(F.lower(F.regexp_replace(F.col('Headline'), '[^\sa-zA-Z0-9]', ''))))

# tokenize titles and headlines
title_tokenizer = Tokenizer(inputCol='Title', outputCol='Title_words')
headline_tokenizer = Tokenizer(inputCol='Headline', outputCol='Headline_words')
df_timestamped = title_tokenizer.transform(df_timestamped)
df_timestamped = headline_tokenizer.transform(df_timestamped)

# remove stop words
titel_remover = StopWordsRemover(inputCol='Title_words', outputCol='Title_final')
headline_remover = StopWordsRemover(inputCol='Headline_words', outputCol='Headline_final')
How to get it done? I've broken this task into 5 steps, as follows:

# Step-1: Break the fits_assembly_name column into assembly_name & models

After observing the pattern, I've figured out the following:
- Model numbers are mentioned after the hyphen (-) in the fits_assembly_name column, and
- They are condensed to fit the space. E.g. three different model numbers V08AB26, V08GB26 and V08LB26 are written as V08AB26/GB26/LB26.
- POS has used "/" and mentioned only the part that is different from earlier model numbers.

To pre-process the Assembly_name column, use regexp_replace + split to separate the model numbers into a new column and remove them from the original Assembly_name column:
- I've used the "regexp_replace" and "selectExpr" functions available in pyspark.sql.functions.
- I've split the fits_assembly_name column string on " - " and created two new columns, i.e. pc_Assemblyname_Withoutmodelno and Models, as follows:
  - regexp_replace: Replace all substrings of the specified string value that match regexp with rep
    - Usage: regexp_replace(x, pattern, replacement)
  - selectExpr: Projects a set of SQL expressions and returns a new DataFrame.
    (Source: https://spark.apache.org/docs/1.5.2/api/python/pyspark.sql.html)

```
from pyspark.sql.functions import regexp_replace, split

df0 = df.withColumn('new_col',
        split(regexp_replace('Assembly_name', r'^(.*)-\s*(\S+)(.*)$', '$1$3\0$2'), '\0')) \
    .selectExpr(
        'Itemno',
        'Assembly_id',
        "coalesce(new_col[0], Assembly_name) as Assembly_name",
        "coalesce(new_col[1], '') as models"
    )
df0.show(truncate=False)
+-------+-----------+---------------------------------------------------------------+--------------------+
|Itemno |Assembly_id|Assembly_name                                                   |models              |
# We then split the words into tokens.
# https://spark.apache.org/docs/latest/ml-features.html#tokenizer

# In[9]:

from pyspark.sql.functions import regexp_replace, trim, col, lower, udf
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover

timestart = datetime.datetime.now()

print("abstracts_full_df2.head() = {}".format(abstracts_full_df2.head()))

# Convert the content to lower case
print("Converting the abstract to lower case ... ")
abstracts_full_df3 = abstracts_full_df2.withColumn("abstractNew", lower(col("abstract"))).\
    withColumn("abstractNew", regexp_replace("abstractNew", '[^\w-_ ]', ""))
abstracts_full_df3.printSchema()
# print("abstracts_full_df3.head() = {}".format(abstracts_full_df3.head()))

# Tokenize the abstracts
print("tokenizing the abstracts... ")
tokenizer = Tokenizer(inputCol="abstractNew", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtWords")
abstracts_full_df4 = tokenizer.transform(abstracts_full_df3)
print("After tokenization: ")
abstracts_full_df4.printSchema()
print("abstracts_full_df4.count() = {}".format(abstracts_full_df4.count()))
# print("abstracts_full_df4.head() = {}".format(abstracts_full_df4.head()))
# check the structure of the WorldCupMatches dataframe
df_wcm.printSchema()
print('The df_wc dataframe has ' + str(df_wcm.count()) + ' records.')
print('The df_wc dataframe has ' + str(df_wcm.distinct().count()) + ' distinct records.')

# COMMAND ----------

from pyspark.sql.functions import desc, asc, col, column, expr, instr, length, substring, regexp_replace, trim, lit, initcap, sum, concat

# Transformations on the WorldCupPlayers dataframe
# The Event field stores values like "G43' G87'", which means the player scored two goals, so we need to count the number of "G"s
# In the same way we can obtain the number of penalties and cards
df_wcp1 = df_wcp.withColumn('POSICION_JUGADOR', expr("case when position='C' THEN 'Captain' WHEN position='GK' THEN 'Goalkeeper' ELSE 'Other' end "))\
    .withColumn('NOMBRE_JUGADOR', initcap(regexp_replace('Player Name', '�', 'u')))\
    .withColumn('NUMERO_GOLES', length('Event') - length(trim(regexp_replace('Event', 'G', ''))))\
    .withColumn('NUMERO_PENALES', length('Event') - length(trim(regexp_replace('Event', 'P', ''))))\
    .withColumn('NUMERO_PENALES_FALLADOS', length('Event') - length(trim(regexp_replace('Event', 'MP', ''))))\
    .withColumn('NUMERO_TARJETAS_ROJAS', length('Event') - length(trim(regexp_replace('Event', 'R', ''))))

# Replace null values with 0
df_wcp1 = df_wcp1.withColumn('NUMERO_GOLES', expr("case when NUMERO_GOLES is null then 0 else NUMERO_GOLES end "))\
    .withColumn('NUMERO_PENALES', expr("case when NUMERO_PENALES is null then 0 else NUMERO_PENALES end "))\
    .withColumn('NUMERO_PENALES_FALLADOS', expr("case when NUMERO_PENALES_FALLADOS is null then 0 else NUMERO_PENALES_FALLADOS end "))\
    .withColumn('NUMERO_TARJETAS_ROJAS', expr("case when NUMERO_TARJETAS_ROJAS is null then 0 else NUMERO_TARJETAS_ROJAS end "))\
    .withColumnRenamed('Team Initials', 'INICIALES_PAIS')\
    .drop('Player Name', 'Position', 'Shirt Number')

# Aggregate by player name, position and country initials
df_wcp_rep = df_wcp1.select('NOMBRE_JUGADOR', 'POSICION_JUGADOR', 'INICIALES_PAIS', 'NUMERO_GOLES', 'NUMERO_PENALES', 'NUMERO_PENALES_FALLADOS', 'NUMERO_TARJETAS_ROJAS')\
    .groupby('NOMBRE_JUGADOR', 'POSICION_JUGADOR', 'INICIALES_PAIS')\
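# Aside (illustrative, not from the original notebook): the goal count above works because
# removing every 'G' shortens the string by exactly the number of 'G's, so the length
# difference is the occurrence count. A minimal sketch, assuming a SparkSession named spark:
from pyspark.sql import functions as F

demo = spark.createDataFrame([("G43' G87'",), (None,)], ["Event"])
demo.select(
    (F.length("Event") - F.length(F.regexp_replace("Event", "G", ""))).alias("NUMERO_GOLES")
).show()
# -> 2 for "G43' G87'", null for the missing Event (hence the null-to-0 step afterwards)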
import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, regexp_replace

def hathiRecord(r):
    return dict([(f["@name"], f["#VALUE"]) for f in r.field])

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: pretty-cluster.py <input> <output>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="Prettyprint Clusters")
    sqlContext = SQLContext(sc)

    raw = sqlContext.read.format('com.databricks.spark.xml') \
        .options(rowTag='doc') \
        .load(sys.argv[1])

    # Un-escape the XML entities in the page text.
    sqlContext.createDataFrame(raw.map(hathiRecord), samplingRatio=1) \
        .withColumn('seq', col('seq').cast('int')) \
        .withColumnRenamed('htid', 'book') \
        .withColumnRenamed('content', 'text') \
        .withColumnRenamed('year', 'date') \
        .withColumn('text', regexp_replace(regexp_replace(col('text'), '&amp;', '&'), '&lt;', '<')) \
        .repartition(200) \
        .write.save(sys.argv[2])

    sc.stop()
print("Start %d Cities Cluster Test, NSlaves = %d" % (NCities, NSlaves), flush=True) print("Execution time #0 %f" % (time.time() - startTime),flush=True) # ## Read two mapping files into dataframes # - Read files from Amazon S3 bucket into Spark dataframes # - Format columns as required to enable joins to dataset below # read and process city FIPS to county FIPS mapping file city_to_fips = spark.read.format("org.apache.spark.csv").option("header","true") \ .csv(latlonFilename) # .csv("/home/ubuntu/project/data/uscitiesLatLongFIPS.csv") city_to_fips = city_to_fips.withColumn("county_FIPS", f.lpad(city_to_fips['county_FIPS'],5,"0")) city_to_fips = city_to_fips.drop("city","zip","id","source","population") city_to_fips = city_to_fips.withColumn("city_ascii", f.regexp_replace('city_ascii', 'Saint', 'St.')) city_to_fips = city_to_fips.withColumnRenamed("city_ascii","CityName") \ .withColumnRenamed("state_name","StateDesc") \ .withColumnRenamed("county_FIPS","FIPS") print((city_to_fips.count(), len(city_to_fips.columns))) city_to_fips.limit(5).toPandas() # read and process commuting zone to county FIPS mappingfile cz_to_fips = spark.read.format("org.apache.spark.csv").option("header","true").option("delimiter", "\t") \ .csv(lmaFilename) # .csv("/home/ubuntu/project/data/1990LMAascii.csv") cz_to_fips = cz_to_fips.filter(cz_to_fips.FIPS !="None") cz_to_fips = cz_to_fips.withColumn("stateabbrv", cz_to_fips["County Name"].substr(-2,99))
dfStreams = dfparquetSrcStreams.withColumn("year", sf.split("createdDate", "\-")[0]) \
    .withColumn("month", sf.split("createdDate", "\-")[1]) \
    .withColumn("day", sf.split((sf.split((sf.split("createdDate", "\-")[2]), "T")[0]), " ")[0])

dfbaseDataTrans = dfTrans.select([col for col in dfTrans.columns])
dfbaseDataSummary = dfSummary.select([col for col in dfSummary.columns])
dfbaseDataStreams = dfStreams.select([col for col in dfStreams.columns if not col.startswith("streams_transactions_")])

dfbaseDataMapTblInt = dfStreams.select(sf.col("id"), sf.col("year"), sf.col("month"), sf.col("day"), sf.col("streams_id"),
                                       dfStreams.colRegex("`streams_transactions_[0-9_]+_id`"))
dfbaseDataMapTblStr = dfStreams.select(sf.col("id"), sf.col("year"), sf.col("month"), sf.col("day"), sf.col("streams_id"),
                                       dfStreams.colRegex("`streams_transactions_[a-zA-Z_]*`"))

dfbaseDataMapTblInt_1 = dfbaseDataMapTblInt.withColumn(
    "streams_transactions_id",
    concat_udf(sf.sort_array(sf.array([col for col in dfbaseDataMapTblInt.columns if col.startswith("streams_transactions_")]))))
dfbaseDataMapTblInt_2 = dfbaseDataMapTblInt_1.select(sf.col("id"), sf.col("year"), sf.col("month"), sf.col("day"),
                                                     sf.col("streams_id"), sf.col("streams_transactions_id"))
dfbaseDataMapTblInt_2 = dfbaseDataMapTblInt_2.withColumn(
    "transactions_id",
    sf.explode(sf.split(sf.trim(sf.regexp_replace("streams_transactions_id", "~", " ")), " ")))
# .withColumn("replace",sf.trim(sf.regexp_replace("streams_transactions_id","~"," "))) \
# dfbaseDataMapTblInt_2.show(10,False)

dfbaseDataTransFinal = dfbaseDataTrans.select(sf.col("id").alias("mongoId"), sf.col("year"), sf.col("month"), sf.col("day"),
                                              sf.col("transactions_id").alias("tran_transactions_id"),
                                              sf.col("transactions_value").alias("tran_transactions_value"),
                                              sf.col("transactions_date").alias("tran_transactions_date")).distinct()
dfbaseDataSummaryFinal = dfbaseDataSummary.select(sf.col("id").alias("mongoId"), sf.col("year"), sf.col("month"), sf.col("day"),
                                                  sf.col("summary_end_date"), sf.col("summary_num_transactions"),
                                                  sf.col("summary_start_date"), sf.col("summary_total_irregular_income"),
                                                  sf.col("summary_total_regular_income"))
# dfbaseDataStreamsFinal = dfbaseDataStreams.select(sf.col("id"),sf.col("year"),sf.col("month"),sf.col("day"),sf.col("applicantId"),sf.col("applicationSource") \
#     ,sf.col("clientID"),sf.col("createdDate"),sf.col("loanApplicationId"),sf.col("mvpApplicantId") \
#     ,sf.col("noHit"),sf.col("successful"),sf.col("timestamp"),sf.col("updatedAt"),sf.col("createdDatePT") \
#     ,sf.col("transactions_id").alias("tran_transactions_id"),sf.col("transactions_value").alias("tran_transactions_value"),sf.col("transactions_date").alias("tran_transactions_date")).distinct()

dfbaseDataMapTblInt_3 = dfbaseDataMapTblInt_2.select(sf.col("id").alias("mongoId"), sf.col("year"), sf.col("month"), sf.col("day"),
                                                     sf.col("streams_id").alias("mptbl_streams_id"),
                                                     sf.col("transactions_id").alias("mptbl_transactions_id"))
# twitter
datasource_tw = glueContext.create_dynamic_frame.from_catalog(
    database="dynamodb", table_name="twitter")

dynframe_twitter = ApplyMapping.apply(
    frame=datasource_tw,
    mappings=[("symbol", "string", "symbol", "string"),
              ("full_text", "string", "full_text", "string"),
              ("created_at", "string", "created_at", "string"),
              ("id", "long", "id", "long"),
              ("url", "string", "url", "string")])

# convert aws glue dynamicframes to spark dataframes
tw = dynframe_twitter.toDF()

# remove [\\n\\t\$#]
tw = tw.withColumn("full_text", f.regexp_replace(f.col("full_text"), "[\\n\\t\$#]", ""))

# convert spark dataframes back to aws glue dynamicframes
dynframe_twitter = DynamicFrame.fromDF(tw, glueContext, "nested")

# partition to 1 to get a single s3 file as output
dynframe_output = dynframe_twitter.repartition(1)

datasink = glueContext.write_dynamic_frame.from_options(
    frame=dynframe_output,
    connection_type="s3",
    connection_options={"path": "s3://541304926041-twitter"},
    format="csv")

job.commit()
    aws_secret_access_key=secret_key,
    region_name="us-west-2")

srcfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/JSON/" + year + "/" + month + "/" + day + ""
tgtfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/Parquet/"

dfjson = sparkSession.read.format("json").option("multiline", "true").option(
    "inferSchema", "true").load(srcfilePath)

data = dfjson.withColumn("data", explode("DATA")).select("data.*")

# dfPT = data.withColumn("createdDatePT",sf.to_timestamp(udf_TZConversion(sf.regexp_replace(data.createdDate,"T"," ").cast("string"),sf.lit("UTC"),sf.lit("US/Pacific")),"yyyy-MM-dd HH:mm:ss"))
dfPT = data.withColumn(
    "createdDatePT",
    sf.from_utc_timestamp(sf.regexp_replace(data.createdDate, "T", " "), "US/Pacific"))

df = dfPT.withColumn("year", sf.split("createdDate", "\-")[0]) \
    .withColumn("month", sf.split("createdDate", "\-")[1]) \
    .withColumn("day", sf.split((sf.split((sf.split("createdDate", "\-")[2]), "T")[0]), " ")[0])

dfbaseData = df.select([col for col in df.columns])
# dfbaseData.show(10,False)

dfrankedId = dfbaseData.withColumn("row_num", sf.row_number().over(Window.partitionBy("id").orderBy(sf.asc("updatedAt")))) \
    .where(sf.col("row_num") == 1) \
    .select(dfbaseData["*"])

dfrankedId.repartition(sf.col("year"), sf.col("month"), sf.col("day")) \
def __launcher_exposure():
    """Launcher page exposure."""
    # The same cube query, formatted with each of the three reference dates
    # (current day, previous day, and the same day one week earlier).
    sql_tpl = """
        select site,title,grouping_id() id_1,count(custom_uuid) playNum,
               round(sum(unix_timestamp(exit)-unix_timestamp(enter))/3600,2) playTime,
               count(distinct custom_uuid) users,
               round(count(custom_uuid)/count(distinct custom_uuid),2) avgPlayNum,
               round((sum(unix_timestamp(exit)-unix_timestamp(enter))/count(distinct custom_uuid))/60,2) avgPlayTime
        from sharp.launcher_page_stay
        where dt="{date}" and exit >= enter and unix_timestamp(exit)-unix_timestamp(enter) <= 1800
        group by site,title with cube
    """
    sql_0 = sql_tpl.format(date=__str_dt_0)
    sql_1 = sql_tpl.format(date=__str_dt_1)
    sql_7 = sql_tpl.format(date=__str_dt_7)

    spark.sql("show databases")
    spark.sql("use sharp")
    df_cube_0 = spark.sql(sql_0)
    df_cube_1 = spark.sql(sql_1)
    df_cube_7 = spark.sql(sql_7)

    # join conditions
    left_conditions_0_1 = (F.coalesce(F.col("t_0.site"), F.lit("123")) == F.coalesce(F.col("t_1.site"), F.lit("123"))) & \
        (F.coalesce(F.col("t_0.title"), F.lit("123")) == F.coalesce(F.col("t_1.title"), F.lit("123"))) & \
        (F.col("t_0.id_1") == F.col("t_1.id_1"))
    left_conditions_0_7 = (F.coalesce(F.col("t_0.site"), F.lit("123")) == F.coalesce(F.col("t_7.site"), F.lit("123"))) & \
        (F.coalesce(F.col("t_0.title"), F.lit("123")) == F.coalesce(F.col("t_7.title"), F.lit("123"))) & \
        (F.col("t_0.id_1") == F.col("t_7.id_1"))

    # final report
    report = df_cube_0.alias("t_0").join(
        df_cube_1.alias("t_1"), left_conditions_0_1, "left_outer"
    ).join(df_cube_7.alias("t_7"), left_conditions_0_7, "left_outer").select(
        F.regexp_replace(F.lit(__str_dt_0), "-", "").cast("int").alias("date"),
        F.col("t_0.site").alias("channelName"),
        F.col("t_0.title").alias("typeName"),
        F.col("t_0.id_1").alias("id_1"),
        F.col("t_0.playNum").alias("totalPlayNum"),
        F.concat(F.round((F.col("t_0.playNum") / F.col("t_1.playNum") - 1) * 100, 2), F.lit("%")).alias("playNumCompareDay"),
        F.concat(F.round((F.col("t_0.playNum") / F.col("t_7.playNum") - 1) * 100, 2), F.lit("%")).alias("playNumCompareWeek"),
        F.col("t_0.playTime").alias("totalPlayTime"),
        F.concat(F.round((F.col("t_0.playTime") / F.col("t_1.playTime") - 1) * 100, 2), F.lit("%")).alias("playTimeCompareDay"),
        F.concat(F.round((F.col("t_0.playTime") / F.col("t_7.playTime") - 1) * 100, 2), F.lit("%")).alias("playTimeCompareWeek"),
        F.col("t_0.users").alias("totalUserCount"),
        F.concat(F.round((F.col("t_0.users") / F.col("t_1.users") - 1) * 100, 2), F.lit("%")).alias("userCountCompareDay"),
        F.concat(F.round((F.col("t_0.users") / F.col("t_7.users") - 1) * 100, 2), F.lit("%")).alias("userCountCompareWeek"),
        F.col("t_0.avgPlayNum").alias("averagePlayNum"),
        F.concat(F.round((F.col("t_0.avgPlayNum") / F.col("t_1.avgPlayNum") - 1) * 100, 2), F.lit("%")).alias("avgPlayNumCompareDay"),
        F.concat(F.round((F.col("t_0.avgPlayNum") / F.col("t_7.avgPlayNum") - 1) * 100, 2), F.lit("%")).alias("avgPlayNumCompareWeek"),
        F.col("t_0.avgPlayTime").alias("averagePlayTime"),
        F.concat(F.round((F.col("t_0.avgPlayTime") / F.col("t_1.avgPlayTime") - 1) * 100, 2), F.lit("%")).alias("avgPlayTimeCompareDay"),
        F.concat(F.round((F.col("t_0.avgPlayTime") / F.col("t_7.avgPlayTime") - 1) * 100, 2), F.lit("%")).alias("avgPlayTimeCompareWeek"))
    return report
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import lit, concat, concat_ws, regexp_replace

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: trove-load.py <input json> <output parquet>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="Trove Load")
    sqlContext = SQLContext(sc)

    raw = sqlContext.read.json(sys.argv[1])

    df = raw.na.drop(subset=['id', 'fulltext']).dropDuplicates(['id'])

    # Un-escape the XML entities in the full text.
    df.select(concat(lit('trove/'), df.id).alias('id'),
              concat_ws('/', lit('trove'), df.titleId, df.date).alias('issue'),
              concat(lit('trove/'), df.titleId).alias('series'),
              df.date,
              df.firstPageId,
              df.firstPageSeq.cast('int').alias('seq'),
              df.heading.alias('title'),
              df.category,
              regexp_replace(regexp_replace(df.fulltext, '&amp;', '&'), '&lt;', '<').alias('text'))\
      .write.save(sys.argv[2])

    sc.stop()
    res['name'] = book
    res['text'] = "\n".join(['<div class="page-break" page="%d">%s</div>' % (r.seq, r.text) for r in pp]) \
        + ('<archiveid tokenizetagcontent="false">%s</archiveid>' % book)
    return Row(**res)

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: pretty-cluster.py <input> <page-out> <book-out>", file=sys.stderr)
        exit(-1)

    sc = SparkContext(appName="Proteus Pages")
    sqlContext = SQLContext(sc)

    raw = sqlContext.read.load(sys.argv[1])
    cols = set(raw.columns)
    idcols = [col(x) for x in ['identifier', 'issue', 'book'] if x in cols]
    df = raw.withColumn('identifier', regexp_replace(coalesce(*idcols), '[^A-Za-z0-9]+', ''))
    counts = df.groupBy('identifier').count().select(col('identifier'), col('count').alias('imagecount'))

    appendID = udf(lambda book, text: '%s <archiveid tokenizetagcontent="false">%s</archiveid>' % (text, book))

    renamed = df.join(counts, 'identifier')\
        .drop('regions')\
        .withColumn('pageNumber', col('seq'))\
        .withColumn('name', concat_ws('_', col('identifier'), col('seq')))\
        .withColumn('text', regexp_replace(col('text'), '\\n', '<br>\\\n'))

    renamed.withColumn('text', appendID(col('identifier'), col('text')))\
        .write.format('json').save(sys.argv[2])

    renamed.rdd.groupBy(lambda r: r.identifier).map(pageCat).toDF()\
import sys spark = SparkSession.builder.appName("my_pp").getOrCreate() joined_df = spark.read.format('csv').options( header='false', inferschema='true').load(sys.argv[1]).select( F.col('_c0').alias('medallion'), F.col('_c3').alias('pickup_datetime')) medallion_stats = joined_df.withColumn( "pickup_datetime", F.date_format(F.col("pickup_datetime"), "yyyy-MM-dd")) medallion_stats = medallion_stats.groupBy(F.col('medallion')).agg( F.count('*').alias('total_trips'), F.countDistinct(F.col('pickup_datetime')).alias('days_driven')) medallion_stats = medallion_stats.select( 'medallion', 'total_trips', 'days_driven', F.regexp_replace( F.format_number( F.round(F.col('total_trips') / F.col('days_driven'), 2), 2), ',', '').alias('average')).sort('medallion') medallion_stats.select( format_string('%s,%s,%s,%s', medallion_stats.medallion, medallion_stats.total_trips, medallion_stats.days_driven, medallion_stats.average)).write.save('task2d-sql.out', format="text") spark.stop()
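# Aside (not from the original script): format_number inserts thousands separators,
# e.g. 1234.5 becomes "1,234.50", which is why the commas are stripped with
# regexp_replace before the comma-separated output line is built. A tiny check:
spark.range(1).select(
    F.regexp_replace(F.format_number(F.lit(1234.5), 2), ',', '').alias('avg_text')
).show()
# -> 1234.50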
SparkContext.setSystemProperty("hive.metastore.uris", "http://192.168.58.24:8888")
spark.conf.set("spark.sql.crossJoin.enabled", "true")

master = spark.sql('SELECT * FROM dwhdb.master_matching')
# master.limit(10).toPandas()
delta = spark.sql('SELECT * FROM dwhdb.delta_matching')
# delta.limit(10).toPandas()

master = (master.withColumn(
    "clean_id",
    F.regexp_replace(
        F.trim(F.lower(F.regexp_replace('nomor_identitas', "[^a-zA-Z0-9\\s]", ""))),
        " +", " ")).withColumn(
    "clean_nama",
    F.regexp_replace(
        F.trim(F.lower(F.regexp_replace('nama_sesuai_identitas', "[^a-zA-Z0-9\\s]", ""))),
        " +", " ")).withColumn(
    "clean_tgl_lahir",
    F.regexp_replace(
        F.trim(F.lower(F.regexp_replace('tanggal_lahir', "[^a-zA-Z0-9\\s]", ""))),
def classify_data(self, file_name): """Profiles columns from input file.""" try: session_id = datetime.datetime.now(timezone).strftime("%Y-%m-%d %H:%M:%S") print("\n\n=====Python Job starts at {session_id}\n".format(session_id=session_id)) save = HyperProfiling.get_url('/dataclass/out/save') update = HyperProfiling.get_url('/dataclass/que/update') if save.split(':')[0] and update.split(':')[0] != 'https': raise HttpProtocolException(save, update, save.split(':')[0], update.split(':')[0]) session = HyperProfiling().get_session() hdfs_output_dir = '/{env}/EDW/DSODB/OPS/CHEVELLE/HYPER_PROF_RSLT/Output'.format(env=self._env) input_path = '/{env}/EDW/DSODB/OPS/CHEVELLE/HYPER_PROF_RSLT/Input/{file_name}' base_query = \ """ select cast('{COL_ID}' as string) as col_id, cast('{DATA_CLS_NM}' as string) as data_cls_nm, cast('{CRT_TS}' as string) as crt_ts, cast('{PROF_START_TS}' as string) as prof_start_ts, cast('{PROF_END_TS}' as string) as prof_end_ts, cast('{TABLE_ID}' as string) as table_id, cast('{BATCH_ID}' as int) as batch_id, cast('{TOTAL_ROW_COUNT}' as int) as tot_row_cnt, cast('{SAMPLE_ROW_COUNT}' as int) as sample_row_cnt, cast(count(distinct({column})) as int) as col_val_uniq_cnt, cast(sum(case when {column} REGEXP '{REGEX_STR}' then 1 else 0 end) as int) as col_val_data_cls_cnt, cast(max({column}) as string) as col_max_val, cast(min({column}) as string) as col_min_val, cast(avg(length({column})) as int) as col_avg_len, cast('{REGEX_STR}' as string) as appl_regex_str, cast('{CRT_BY}' as string) as crt_by from {temp_table} """ file_schema = StructType([ StructField("hive_schema", StringType()), StructField("table_name", StringType()), StructField("col_name", StringType()), StructField("data_cls_nm", StringType()), StructField("regex_str", StringType()), StructField("table_id", StringType()), StructField("col_id", StringType()), StructField("batch_id", StringType()), ]) empty_schema = StructType([ StructField("col_id", StringType()), StructField("data_cls_nm", StringType()), StructField("crt_ts", StringType()), StructField("prof_start_ts", StringType()), StructField("prof_end_ts", StringType()), StructField("table_id", StringType()), StructField("batch_id", StringType()), StructField("tot_row_cnt", StringType()), StructField("sample_row_cnt", StringType()), StructField("col_val_uniq_cnt", StringType()), StructField("col_val_data_cls_cnt", StringType()), StructField("col_max_val", StringType()), StructField("col_min_val", StringType()), StructField("col_avg_len", StringType()), StructField("appl_regex_str", StringType()), StructField("crt_by", StringType()) ]) file_ = input_path.format(env=self._env, file_name=file_name) print(file_) schema_check = self._spark.read.csv(file_, sep='\x1c') schema_length = len(schema_check.columns) file_extension = file_name.split('.')[1] if file_extension not in ('txt', 'csv', 'dat'): raise InvalidFileExtension(file_name, file_extension) elif schema_length < 8: raise InvalidSchemaException(file_name, schema_length) else: print("File is ok to process") input_file_base = self._spark.read.csv(file_, sep='\x1c', schema=file_schema) input_file = input_file_base.withColumn('hive_schema', regexp_replace('hive_schema', "\-", "_")) input_restructured = input_file. \ select('table_id', 'col_id', 'batch_id', 'data_cls_nm', 'col_name', 'regex_str', F.concat_ws('.', 'hive_schema', 'table_name').alias('table')). 
\ orderBy('table', ascending=False) input_tables = input_restructured.select(input_restructured['table']).dropDuplicates() input_tables_broadcast = self._spark.sparkContext.broadcast(input_tables.rdd.collect()) application_start = time.time() for tables in input_tables_broadcast.value: rows_to_profile = \ input_restructured.select('table_id', 'col_id', 'batch_id', 'col_name', 'data_cls_nm', 'regex_str'). \ where(input_restructured['table'] == '{table}'.format(table=tables.table)).toLocalIterator() for row in list(rows_to_profile): col_id = row.col_id table_id = row.table_id batch_id = int(row.batch_id) data_cls_nm = row.data_cls_nm col_name = row.col_name regex_str = row.regex_str. \ replace(chr(169), chr(92)). \ replace(chr(171), chr(123)). \ replace(chr(187), chr(125)) empty_ts = datetime.datetime.now(timezone).strftime("%Y-%m-%d %H:%M:%S") failed_update_schema = \ { 'col_id': col_id, 'data_cls_nm': data_cls_nm, 'batch_id': batch_id, 'task_stat_cd': 3, 'fail_cnt': 0 } success_update_schema = \ { 'col_id': col_id, 'data_cls_nm': data_cls_nm, 'batch_id': batch_id, 'task_stat_cd': 2, 'fail_cnt': 0 } empty_update_schema = \ { 'col_id': col_id, 'data_cls_nm': data_cls_nm, 'batch_id': batch_id, 'task_stat_cd': 4, 'fail_cnt': 0 } empty_record = ( col_id, data_cls_nm, empty_ts, empty_ts, empty_ts, table_id, batch_id, "", "", "", "", "", "", "", regex_str.replace(chr(92), chr(169)).replace(chr(123), chr(171)).replace(chr(125), chr(187)), "chevelle" ) try: table_data = self._spark.table(tables.table) data_empty = table_data.rdd.isEmpty() except: try: update_output = UpdateQueue({'update': update, 'session': session}) update_response = update_output.save_results({'update_output': empty_update_schema}) empty_df = self._spark.createDataFrame([empty_record], empty_schema) empty_df.write.json(hdfs_output_dir, mode='ignore') payload = json.loads(empty_df.toJSON().collect()[0], object_pairs_hook=OrderedDict) output = SaveOutput({'save': save, 'session': session}) output_response = output.save_results({'payload': payload}) if update_response['status'] != 200: summary_msg = '{output}\tCRITICAL FAIL\tCheck Update Endpoint'.format( output=json.dumps(failed_update_schema)) print(summary_msg) sys.exit(0) elif output_response['status'] != 200: raise Exception except Exception: update_output = UpdateQueue({'update': update, 'session': session}) failed_update_response = update_output.save_results({'update_output': failed_update_schema}) exc_type, exc_value, exc_tb = sys.exc_info() print(traceback.format_exception(exc_type, exc_value, exc_tb)) if failed_update_response['status'] != 200: summary_msg = '{output}\tCRITICAL FAIL\tCheck Update Endpoint'.format( output=json.dumps(failed_update_schema)) print(summary_msg) sys.exit(0) else: failed_log = {'content': json.dumps(failed_update_schema),'status': failed_update_response['status']} r = requests.post( 'https://chevelle-elk-logger.cp-epg2i.domain.com/logging/chevelle_dc/stdout', json=failed_log) r.close() continue update_log = {'content': json.dumps(empty_update_schema), 'status':update_response['status']} r = requests.post('https://chevelle-elk-logger.cp-epg2i.domain.com/logging/chevelle_dc/stdout', json=update_log) r.close() continue else: if not data_empty: try: table_data_column = table_data.select(col_name). \ where(F.length(col_name) > 0). \ where(lower(trim(table_data[col_name])). 
isin(' ', 'null', 'n/a', 'unknown', 'unk', 'unspecified', 'no match row id', '__not_applicable__') == False) except Exception: try: update_output = UpdateQueue({'update': update, 'session': session}) update_response = update_output.save_results({'update_output': empty_update_schema}) empty_df = self._spark.createDataFrame([empty_record], empty_schema) empty_df.write.json(hdfs_output_dir, mode='ignore') payload = json.loads(empty_df.toJSON().collect()[0], object_pairs_hook=OrderedDict) output = SaveOutput({'save': save, 'session': session}) output_response = output.save_results({'payload': payload}) if output_response['status'] != 200: raise Exception except Exception: update_output = UpdateQueue({'update': update, 'session': session}) update_response = update_output.save_results({'update_output': failed_update_schema}) exc_type, exc_value, exc_tb = sys.exc_info() print(traceback.format_exception(exc_type, exc_value, exc_tb)) if update_response['status'] != 200: summary_msg = '{output}\tCRITICAL FAIL\tCheck Update Endpoint'.format( output=json.dumps(failed_update_schema)) print(summary_msg) sys.exit(0) else: print(failed_update_schema, update_response['status']) continue print(empty_update_schema, update_response['status']) continue else: column_row_count_full = table_data_column.count() output_strings = \ { 'COL_ID': col_id, 'DATA_CLS_NM': data_cls_nm, 'PROF_START_TS': 'START', 'PROF_END_TS': 'END', 'BATCH_ID': batch_id, 'TABLE_ID': table_id, 'TOTAL_ROW_COUNT': column_row_count_full, 'SAMPLE_ROW_COUNT': column_row_count_full, 'CRT_BY': 'chevelle', 'CRT_TS': 'CRT', 'REGEX_STR': regex_str, 'column': col_name, 'temp_table': 'temp_' } if column_row_count_full >= 1000000: column_sample = table_data_column.sample(False, 0.1) column_sample.createOrReplaceTempView('temp_sample') column_sample_count = column_sample.count() output_strings['SAMPLE_ROW_COUNT'] = column_sample_count output_strings['temp_table'] = 'temp_sample' try: profile_start = datetime.datetime.now(timezone).strftime("%Y-%m-%d %H:%M:%S") output_strings['PROF_START_TS'] = profile_start results = self._spark.sql(base_query.format(**output_strings)) profile_end = datetime.datetime.now(timezone).strftime("%Y-%m-%d %H:%M:%S") final_results = \ results.\ replace('END', profile_end, 'PROF_END_TS').\ replace('CRT', profile_end, 'CRT_TS') update_output = UpdateQueue({'update': update, 'session': session}) update_response = update_output.save_results({'update_output': success_update_schema}) final_results.write.json(hdfs_output_dir, mode='ignore') payload = json.loads(final_results.toJSON().collect()[0], object_pairs_hook=OrderedDict) payload['appl_regex_str'] = \ payload['appl_regex_str'].\ replace(chr(92), chr(169)).\ replace(chr(123), chr(171)).\ replace(chr(125), chr(187)) output = SaveOutput({'save': save, 'session': session}) output_response = output.save_results({'payload': payload}) if output_response['status'] != 200: raise Exception except Exception: update_output = UpdateQueue({'update': update, 'session': session}) update_response = update_output.save_results({'update_output': failed_update_schema}) exc_type, exc_value, exc_tb = sys.exc_info() print(traceback.format_exception(exc_type, exc_value, exc_tb)) if update_response['status'] != 200: summary_msg = '{output}\tCRITICAL FAIL\tCheck Update Endpoint'.format( output=json.dumps(failed_update_schema)) print(summary_msg) sys.exit(0) else: print(failed_update_schema, update_response['status']) continue print(success_update_schema, update_response['status']) elif 0 < 
column_row_count_full < 1000000: table_data_column.createOrReplaceTempView('temp_') try: profile_start = datetime.datetime.now(timezone).strftime("%Y-%m-%d %H:%M:%S") output_strings['PROF_START_TS'] = profile_start results = self._spark.sql(base_query.format(**output_strings)) profile_end = datetime.datetime.now(timezone).strftime("%Y-%m-%d %H:%M:%S") final_results = \ results.\ replace('END', profile_end, 'PROF_END_TS').\ replace('CRT', profile_end, 'CRT_TS') update_output = UpdateQueue({'update': update, 'session': session}) update_response = update_output.save_results({'update_output': success_update_schema}) final_results.write.json(hdfs_output_dir, mode='ignore') payload = json.loads(final_results.toJSON().collect()[0], object_pairs_hook=OrderedDict) payload['appl_regex_str'] = \ payload['appl_regex_str']. \ replace(chr(92), chr(169)). \ replace(chr(123), chr(171)). \ replace(chr(125), chr(187)) output = SaveOutput({'save': save, 'session': session}) output_response = output.save_results({'payload': payload}) if output_response['status'] != 200: raise Exception except Exception: update_output = UpdateQueue({'update': update, 'session': session}) update_response = update_output.save_results({'update_output': failed_update_schema}) exc_type, exc_value, exc_tb = sys.exc_info() print(traceback.format_exception(exc_type, exc_value, exc_tb)) if update_response['status'] != 200: summary_msg = '{output}\tCRITICAL FAIL\tCheck Update Endpoint'.format( output=json.dumps(failed_update_schema)) print(summary_msg) sys.exit(0) else: print(failed_update_schema, update_response['status']) continue print(success_update_schema, update_response['status']) else: try: update_output = UpdateQueue({'update': update, 'session': session}) update_response = update_output.save_results({'update_output': empty_update_schema}) empty_df = self._spark.createDataFrame([empty_record], empty_schema) empty_df.write.json(hdfs_output_dir, mode='ignore') payload = json.loads(empty_df.toJSON().collect()[0], object_pairs_hook=OrderedDict) output = SaveOutput({'save': save, 'session': session}) output_response = output.save_results({'payload': payload}) if output_response['status'] != 200: raise Exception except Exception: update_output = UpdateQueue({'update': update, 'session': session}) update_response = update_output.save_results({'update_output': failed_update_schema}) exc_type, exc_value, exc_tb = sys.exc_info() print(traceback.format_exception(exc_type, exc_value, exc_tb)) if update_response['status'] != 200: summary_msg = '{output}\tCRITICAL FAIL\tCheck Update Endpoint'.format( output=json.dumps(failed_update_schema)) print(summary_msg) sys.exit(0) else: print(failed_update_schema, update_response['status']) continue print(empty_update_schema, update_response['status']) continue except: sys.exit(0) application_end_time = time.time() print('-- Application Run time --') print(str(application_end_time - application_start)) sys.exit(0)
# This file is called word_counter.py
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()
df = spark.read.text(paths='/job/samples/word_counter.py')

# Replace code chars with spaces
df = df.withColumn('value', F.regexp_replace('value', '\W', ' '))

# Split on spaces
df = df.select(F.explode(F.split('value', ' ')).alias('word'))

# Filter min length
df = df.where(F.length('word') > 0)

# Group by occurrence and order
agg = (df.groupBy('word').count().sort('count', ascending=False))

# Print top result
agg.limit(5).show()
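# Aside (not part of the original file): '\W' matches non-word characters only, so
# underscores survive the replace and upper/lower case words are counted separately.
# If that matters, the cleaning step above could instead be written as:
#     df = df.withColumn('value', F.lower(F.regexp_replace('value', '[\\W_]', ' ')))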
df_new = df_new.fillna({'Time': 3})

# Removing NA locations of violation location and violation county
df_new = df_new.dropna(how='any', subset=['Violation Location', 'Violation County'])

# Fill NA of these columns using respective max values
# cols = ['Vehicle Body Type','Vehicle Make','Violation County','Violation In Front Of Or Opposite']
# agg_expr = [mode(f.collect_list(col)).alias(col) for col in cols]
# max_vals = df_new.agg(*agg_expr).collect()[0]
# df_new = df_new.fillna({'Vehicle Body Type':max_vals['Vehicle Body Type'],'Vehicle Make':max_vals['Vehicle Make'],'Violation County':max_vals['Violation County'],'Violation In Front Of Or Opposite':max_vals['Violation In Front Of Or Opposite']})
df_new = df_new.dropna(how='any')

# Renaming columns
names = df_new.schema.names
for name in names:
    df_new = df_new.withColumnRenamed(name, name.replace(" ", "_"))

# Mapping violation location
df_new = df_new.withColumn('Violation_Location', regexp_replace('Violation_Location', 'KINGS', 'K'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'KING', 'K'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'QUEEN', 'Q'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'QU', 'Q'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'NEWY', 'NY'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'NEW Y', 'NY'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'MAN', 'NY'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'MH', 'NY'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'BRONX', 'BX'))

############################################################## TRAINING PIPELINE ##############################################################

# Label encoding pipeline
# Split the data
train, test = df_new.randomSplit([0.93, 0.07])

indexers = [
    StringIndexer(inputCol=column,
# Turning Text into Tables
"""
- remove punctuation and numbers
- tokenize (split into individual words)
- remove stop words
- apply the hashing trick
- convert to TF-IDF representation.
"""

# Import the necessary functions
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Remove punctuation (REGEX provided) and numbers
wrangled = sms.withColumn('text', regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, '[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +', ' '))

# Split the text into words
wrangled = Tokenizer(inputCol='text', outputCol='words').transform(wrangled)

wrangled.show(4, truncate=False)

#########################################################################################
# Stop words and hashing
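# A minimal sketch of the remaining steps listed above (stop words, hashing trick,
# TF-IDF); the output column names other than 'words' are assumptions, not from the
# original exercise:
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

wrangled = StopWordsRemover(inputCol='words', outputCol='terms').transform(wrangled)
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024).transform(wrangled)
wrangled = IDF(inputCol='hash', outputCol='features').fit(wrangled).transform(wrangled)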
        F.concat(
            F.lit('<dccon> '),
            F.regexp_extract(col, r'src="[^?]*\?no=([^"]+)"', 1),
            F.lit(' '),
            F.regexp_extract(col, r'title="([^"]*)"', 1))).otherwise(F.col(col)))

d2c_df = df.selectExpr('gallery_id', 'title', 'author', 'EXPLODE(comments) as comment')
d2c_df = d2c_df.selectExpr('gallery_id', 'title', 'author', 'comment.contents as comment', 'comment.author as comm_author')\
    .filter(F.col('author') != F.col('comm_author'))\
    .select('gallery_id', 'title', 'comment')
d2c_df = d2c_df.filter((~F.col('comment').startswith('<div')) & (~F.col('comment').isNull()))
d2c_df = d2c_df.withColumn('title', F.regexp_replace('title', r'[\s\n\t]+', ' '))
d2c_df = d2c_df.withColumn('comment', F.regexp_replace('comment', r'[\s\n\t]+', ' '))
d2c_df = dccon_parse(d2c_df, 'comment')
d2c_df = d2c_df.selectExpr('''CONCAT("text:", gallery_id, "¶", title, '\t',
                                     'labels:', comment, '\t',
                                     "episode:done") AS episode''')
d2c_df = d2c_df.distinct()

c2c_df = df.selectExpr(
    'gallery_id',
    'id as document_id',
    'author as document_author',
    'EXPLODE(comments) AS comment')\
    .select('gallery_id', 'document_id', 'document_author',
            'comment.author', 'comment.contents', 'comment.created_at',
            F.coalesce('comment.parent_id', 'comment.id').alias('root_id'))
c2c_df = c2c_df.filter((~F.col('contents').startswith('<div'))
df = dfparquet.withColumn("year", sf.split("createdDate", "\-")[0]) \
    .withColumn("month", sf.split("createdDate", "\-")[1]) \
    .withColumn("day", sf.split((sf.split((sf.split("createdDate", "\-")[2]), "T")[0]), " ")[0])

dfbaseData = df.select([col for col in df.columns])

dfjoin = dfbaseData.join(dfrmi, (dfbaseData.loanApplicationId == dfrmi.loan_application_id) &
                         (sf.unix_timestamp(dfbaseData.createdDate) - sf.unix_timestamp(dfrmi.date_created) >= 0), 'left_outer') \
    .join(dfrmsNS, (dfrmi.id == dfrmsNS.input_id), 'left_outer') \
    .select(dfbaseData.id,
            dfbaseData.applicantId,
            dfbaseData.applicationSource,
            dfbaseData.mvpApplicantId,
            dfbaseData.loanApplicationId,
            # dfbaseData.mvpLoanApplicationId,
            sf.regexp_replace(dfbaseData.createdDate, "T", " ").cast(TimestampType()).alias("telesignTimestampUTC"),
            dfbaseData.createdDatePT.alias("telesignTimestampPT"),
            dfrmi.id.alias("rmiId"),
            dfrmi.date_created.alias("riskTimestampPT"),
            dfrmsNS.score_type.alias("scoreType"),
            dfbaseData.year,
            dfbaseData.month,
            dfbaseData.day)

dfrankedId = dfjoin.withColumn("row_num", sf.row_number().over(Window.partitionBy("loanApplicationId", "telesignTimestampPT").orderBy(sf.desc("riskTimestampPT"), sf.desc("scoreType")))) \
    .where(sf.col("row_num") == 1) \
    .select(dfjoin["*"])

dfsplitColDevice = dfbaseData.select(sf.col("id").alias("id"),
                                     sf.col("device_info_imei").alias("imei"),
                                     sf.col("device_info_make").alias("make")
taxiTripsRaw = sparkSession.read.csv(path=filePath,
                                     header=True,
                                     schema=schemaTaxiTrips,
                                     timestampFormat="MM/dd/yyyy hh:mm:ss a",
                                     mode="DROPMALFORMED")

# Data cleaning
taxiTrips = taxiTripsRaw.select(
    "trip_id", "taxi_id", "trip_start_timestamp", "trip_end_timestamp",
    taxiTripsRaw["trip_seconds"].astype('integer').alias("trip_seconds"),
    taxiTripsRaw["trip_miles"].astype('integer').alias("trip_miles"),
    "pickup_census_tract", "dropoff_census_tract",
    taxiTripsRaw["pickup_community_area"].astype('integer').alias("pickup_community_area"),
    taxiTripsRaw["dropoff_community_area"].astype('integer').alias("dropoff_community_area"),
    F.regexp_replace(taxiTripsRaw["fare"], '[\$,)]', '').astype('double').alias("fare"),
    F.regexp_replace(taxiTripsRaw["tips"], '[\$,)]', '').astype('double').alias("tips"),
    F.regexp_replace(taxiTripsRaw["tolls"], '[\$,)]', '').astype('double').alias("tolls"),
    F.regexp_replace(taxiTripsRaw["extras"], '[\$,)]', '').astype('double').alias("extras"),
    F.regexp_replace(taxiTripsRaw["trip_total"], '[\$,)]', '').astype('double').alias("trip_total"),
    "payment_type", "company",
    "pickup_centroid_latitude", "pickup_centroid_longitude", "pickup_centroid_location",
    "dropoff_centroid_latitude", "dropoff_centroid_longitude", "dropoff_centroid_location",
    F.year(taxiTripsRaw["trip_start_timestamp"]).alias("year"),
    F.month(taxiTripsRaw["trip_start_timestamp"]).alias("month"))

# Write the transformed data to S3 (AWS) or HDFS (local), partitioned by year and month
import pyspark.sql.functions as f
from matplotlib import pyplot as plt
from pyspark.sql.types import StructType, StringType, DoubleType, IntegerType, BooleanType, StructField, LongType, DateType, TimestampType, FloatType
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel, LogisticRegression, LogisticRegressionModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline

# COMMAND ----------

bids_train = spark.read.parquet("s3://rtl-databricks-datascience/lpater/processed_data/bids_train.parquet/")

# COMMAND ----------

bids_train = bids_train.withColumn('new_deal_id', f.regexp_replace('deal_id', "\.|18ff3", ""))\
    .drop('deal_id')\
    .withColumnRenamed('new_deal_id', 'deal_id')

bids_train.cache()

# COMMAND ----------

deal_ids = bids_train.select("deal_id").distinct()
deal_ids_list = list(deal_ids.select("deal_id").toPandas()["deal_id"])

# COMMAND ----------

for deal_id in deal_ids_list:
    temp_bids = bids_train.filter(bids_train.deal_id == deal_id).select("max_bid")
import sys

import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
import sqlite3
import pandas

spark = SparkSession.builder.appName('move_ru parse').getOrCreate()
filePath = sys.argv[1] + '/result_01.csv'
conn = sqlite3.connect('movedatabase.db')

# load the data from the file
df = spark.read.csv(filePath, inferSchema=True, header=False)

# clean the loaded data
split_column = F.split(df['_c2'], ' ')
df = df.withColumnRenamed('_c0', 'flat_id') \
    .withColumn('city_type', F.regexp_replace(split_column.getItem(2), r'[^а-я]', '')) \
    .withColumn('city', F.split(df['_c2'], '/').getItem(1)) \
    .withColumn('rooms', F.split(df['_c5'], ':').getItem(2)) \
    .withColumn('m2', F.split(df['_c5'], ' ').getItem(2)) \
    .withColumn('price', F.regexp_replace(F.col('_c1'), r'\D', '')) \
    .withColumn('price_m', F.round((F.col('price') / F.col('m2')), 0)) \
    .withColumn('m2_room', F.regexp_replace(F.split(df['_c5'], ':').getItem(1), r'м2 Комнат', '')) \
    .withColumn('floor', F.split(df['_c5'], ' ').getItem(0)) \
    .withColumn('region', split_column.getItem(0)) \
    .withColumn('highway', F.split(df['_c3'], ' ').getItem(0)) \
    .withColumn('mkad_km', F.split(df['_c4'], ' ').getItem(0)) \
    .withColumnRenamed('_c6', 'update_date') \
    .drop('_c1', '_c2', '_c3', '_c4', '_c5')
# .write.csv(sys.argv[1]+'/result_clean', header = True)

# write the resulting data to the database
.withColumn("day",sf.split((sf.split((sf.split("createdDate","\-")[2]),"T")[0])," ")[0]) for colm in df.columns: if colm.startswith("resultcode_"): df = df.withColumn(colm, sf.concat(sf.lit(colm + "^"), sf.col(colm))) df_resultcode = df.withColumn( "resultcodeArray", sf.array([col for col in df.columns if col.startswith("resultcode_")])) dfexplode = df_resultcode.select(sf.col("id"),sf.col("mvpLoanApplicationId"),sf.col("loanApplicationId"), \ sf.col("clientID"),sf.col("mvpClientID"),sf.col("createdDatePT"),sf.col("year"),sf.col("month"),sf.col("day") \ ,sf.explode_outer("resultcodeArray").alias("resultcodes") \ ).where(sf.col("resultcodes").isNotNull()) dfsplitCol = dfexplode.withColumn("result_key",sf.regexp_replace(sf.split("resultcodes","\^")[0],"_",".")) \ .withColumn("result_message",sf.split("resultcodes","\^")[1]) \ .drop("resultcodes") dfbaseData = df.select( [col for col in df.columns if not col.startswith("resultcode_")]) dfjoin = dfbaseData.join(dfrmi,(dfbaseData.loanApplicationId == dfrmi.loan_application_id) & \ (sf.unix_timestamp(dfbaseData.createdDatePT) - sf.unix_timestamp(dfrmi.date_created) >= 0),'left_outer') \ .join(dfrmsNS,(dfrmi.id == dfrmsNS.input_id),'left_outer') \ .select(dfbaseData.id, \ dfbaseData.mvpLoanApplicationId, \ dfbaseData.loanApplicationId, \ dfbaseData.clientID, \ dfbaseData.mvpClientID, \ sf.regexp_replace(dfbaseData.createdDate,"T"," ").cast(TimestampType()).alias("idologyTimestampUTC"), \
srcfilePathParquetGold = "s3://" + bucket + "/" + gold_path + vendor + "/timestamp_date=" + year + "-" + month + "-" + day + ""
srcfilePathParquet = "s3://" + bucket + "/" + enriched_path + vendor + "/Parquet/year=" + year + "/month=" + month + "/day=" + day + ""
tgtfilePathAudits = "s3://" + bucket + "/" + enriched_path + "Audits/" + vendor + "/"

dfparquetGold = sparkSession.read.format("parquet").load(srcfilePathParquetGold)
dfparquet = sparkSession.read.format("parquet").load(srcfilePathParquet)

dfparquetGold = dfparquetGold.withColumn("gold_date_created_utc", sf.from_unixtime(dfparquetGold.timestamp/1000, 'YYYY-MM-dd').substr(1, 7))
# .where(dfparquetGold.successful == "true")

dfparquet = dfparquet.withColumn(
    "enriched_date_created_utc",
    sf.regexp_replace(dfparquet.createdDate, "T", " ").substr(1, 7))

# dfparquetGoldStg = dfparquetGold.groupBy(dfparquetGold.date_created_utc).count().distinct().orderBy(sf.asc("date_created_utc"))
# dfparquetGoldFinal = dfparquetGoldStg.select(dfparquetGoldStg.date_created_utc, dfparquetGoldStg.count)
dfparquetGoldStg = dfparquetGold.groupBy(
    dfparquetGold.gold_date_created_utc).agg(
        countDistinct("_id").alias("gold_cnt_id")).orderBy(
            sf.asc("gold_date_created_utc"))
dfparquetGoldFinal = dfparquetGoldStg.select(
    dfparquetGoldStg.gold_date_created_utc, dfparquetGoldStg.gold_cnt_id)
# dfparquetGoldFinal.show(1, False)

dfparquetStg = dfparquet.groupBy(dfparquet.enriched_date_created_utc).agg(
trim(df.extra_people)).withColumn("monthly_price", trim( df.monthly_price)).withColumn("price", trim(df.price)).withColumn( "security_deposit", trim(df.security_deposit)).withColumn( "weekly_price", trim(df.weekly_price)).withColumn("host_response_rate", trim(df.host_response_rate)) df = df.fillna('0', [ 'cleaning_fee', 'extra_people', 'monthly_price', 'price', 'security_deposit', 'weekly_price' ]) df = df.fillna(datetime.datetime.now().strftime("%M/%d/%Y"), ['last_scraped']) print("Done Filling zeros") print(df.columns) df = df.withColumn("cleaning_fee",regexp_replace(col("cleaning_fee"), "[^\d*\.?\d+]", ""))\ .withColumn("extra_people",regexp_replace(col("extra_people"), "[^\d*\.?\d+]", ""))\ .withColumn("monthly_price",regexp_replace(col("monthly_price"), "[^\d*\.?\d+]", ""))\ .withColumn("price",regexp_replace(col("price"), "[^\d*\.?\d+]", ""))\ .withColumn("security_deposit",regexp_replace(col("security_deposit"), "[^\d*\.?\d+]", ""))\ .withColumn("weekly_price",regexp_replace(col("weekly_price"), "[^\d*\.?\d+]", ""))\ .withColumn("weekly_price",regexp_replace(col("weekly_price"), "N/A", ""))\ .withColumn("host_response_rate",regexp_replace(col("host_response_rate"), "[^\d+]", "")) print("cleaning_fee Head") df.show() datasource1 = DynamicFrame.fromDF(df, glueContext, "nested") ## @type: ApplyMapping ## @args: [mapping = [("id", "long", "id", "long"), ("last_scraped", "string", "last_scraped", "timestamp"), ("host_id", "long", "host_id", "long"), ("host_name", "string", "host_name", "string"), ("host_since", "string", "host_since", "string"), ("host_response_time", "string", "host_response_time", "string"), ("host_response_rate", "string", "host_response_rate", "float"), ("host_neighbourhood", "string", "host_neighbourhood", "string"), ("host_total_listings_count", "double", "host_total_listings_count", "double"), ("host_identity_verified", "string", "host_identity_verified", "string"), ("neighbourhood_group_cleansed", "string", "neighbourhood_group_cleansed", "string"), ("city", "string", "city", "string"), ("state", "string", "state", "string"), ("zipcode", "long", "zipcode", "long"), ("country_code", "string", "country_code", "string"), ("country", "string", "country", "string"), ("latitude", "double", "latitude", "double"), ("longitude", "double", "longitude", "double"), ("property_type", "string", "property_type", "string"), ("room_type", "string", "room_type", "string"), ("accommodates", "long", "accommodates", "long"), ("bathrooms", "double", "bathrooms", "double"), ("bedrooms", "double", "bedrooms", "double"), ("beds", "double", "beds", "double"), ("bed_type", "string", "bed_type", "string"), ("square_feet", "double", "square_feet", "double"), ("price", "string", "price", "float"), ("weekly_price", "string", "weekly_price", "float"), ("monthly_price", "string", "monthly_price", "float"), ("security_deposit", "string", "security_deposit", "float"), ("cleaning_fee", "string", "cleaning_fee", "float"), ("guests_included", "long", "guests_included", "long"), ("extra_people", "string", "extra_people", "float"), ("number_of_reviews", "long", "number_of_reviews", "long"), ("first_review", "string", "first_review", "timestamp"), ("last_review", "string", "last_review", "timestamp"), ("review_scores_value", "double", "review_scores_value", "double"), ("cancellation_policy", "string", "cancellation_policy", "string")], transformation_ctx = "applymapping1"] ## @return: applymapping1 ## @inputs: [frame = datasource0]
    return SparseVector(x.size, [(k, v) for (k, v) in zip(x.indices, x.values) if v >= t])

if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description='Cluster features')
    argparser.add_argument('-c', '--minCount', type=int, default=1.0)
    argparser.add_argument('-s', '--clusterSize', type=int, default=1)
    argparser.add_argument('indir', help='Input directory')
    argparser.add_argument('outdir', help='Output directory')
    args = argparser.parse_args()

    spark = SparkSession.builder.appName('Cluster Features').getOrCreate()

    df = spark.read.load(args.indir)

    raw = df.filter(col('size') >= args.clusterSize) \
        .select('cluster', 'size', regexp_replace('text', u'\xad\s*', '').alias('text'))
    raw.cache()

    tok = RegexTokenizer(inputCol='text', outputCol='terms', gaps=False, pattern='\w+') \
        .transform(raw)

    counts = CountVectorizer(inputCol='terms', outputCol='counts', minDF=2.0) \
        .fit(tok).transform(tok)

    mergeCounts = udf(lambda va, size: threshold_sparse(scale_sparse(reduce(add_sparse, va), 1.0/size), args.minCount),
                      VectorUDT())

    res = counts.groupBy('cluster', 'size') \
        .agg(mergeCounts(collect_list('counts'), 'size').alias('counts'))

    # lda = LDA(k=2, featuresCol='counts', seed=1, optimizer='em')
    # model = lda.fit(res)
def text_formatting(spark): """ Extract formatting features from the text of a post Args: spark (SparkSession): used to run queries and commands Returns: DataFrame: With columns [ (post)_Id, #codelines, #html_blocks, #headings, #referencelist, #quotes, #codeblocks, #themebreaks, #codespans, #references, #links, #inline_images, #mail_addresses, #emphasis, #strong ] """ # Replaces formatted text that has already been processed FILLER = 'x' # Parser helper column COLNAME = 'processed_text' COL = col(COLNAME) # Data loading post_history_df = spark.read.parquet("/user/***REMOVED***/StackOverflow/PostHistory.parquet") \ .select(['_PostId', '_Text', '_PostHistoryTypeId']) \ .filter(col('_PostHistoryTypeId') == 2) \ .drop('_PostHistoryTypeId') post_df = spark.read.parquet('/user/***REMOVED***/StackOverflow/Posts.parquet') \ .select(['_Id', '_PostTypeId']) \ .filter(col('_PostTypeId') == 1) \ .drop("_PostTypeId") df = post_history_df.join(post_df, post_df['_Id'] == post_history_df['_PostId']) # Count lines and words of the formatted text df = df.withColumn('#lines', size(split(col('_Text'), r'\n'))) \ .withColumn('#words', size(split(col('_Text'), r'\s+'))) # BLOCK ELEMENTS # Count code lines df = df.withColumn(COLNAME, split(col('_Text'), regex.CODE_BLOCK_RE)) \ .withColumn('#codelines', size(COL) - 1) \ .withColumn('codeline_ratio', col('#codelines') / col('#lines')) \ .withColumn(COLNAME, array_join(COL, FILLER)) # Count HTML blocks df = df.withColumn(COLNAME, split(COL, regex.HTML_BLOCK_RE)) \ .withColumn('#html_blocks', size(COL) - 1) \ .withColumn(COLNAME, array_join(COL, FILLER)) # # Count headings (1/2) df = df.withColumn(COLNAME, split(COL, regex.SETEXT_HEADING_RE)) \ .withColumn('#headings', size(COL) - 1) \ .withColumn(COLNAME, array_join(COL, FILLER)) # Count reference list df = df.withColumn(COLNAME, split(COL, regex.REFERENCE_LIST_RE)) \ .withColumn('#referencelist', size(COL) - 1) \ .withColumn(COLNAME, array_join(COL, FILLER)) # Count quotes df = df.withColumn(COLNAME, split(COL, regex.QUOTE_RE)) \ .withColumn('#quotes', size(COL) - 1) \ .withColumn(COLNAME, array_join(COL, FILLER)) # Count headings (2/2) df = df.withColumn(COLNAME, split(COL, regex.HEADING_RE)) \ .withColumn('#headings', size(COL) - 1 + col('#headings')) \ .withColumn('heading_ratio', col('#headings') / col('#lines')) \ .withColumn(COLNAME, array_join(COL, FILLER)) # Count code blocks df = df.withColumn(COLNAME, split(COL, regex.FENCED_CODE_RE)) \ .withColumn('#codeblocks', size(COL) - 1) \ .withColumn('codeblock_ratio', col('#codeblocks') / col('#lines')) \ .withColumn(COLNAME, array_join(COL, FILLER)) # Count thematic break df = df.withColumn(COLNAME, split(COL, regex.THEME_BREAK_RE)) \ .withColumn('#themebreaks', size(COL) - 1) \ .withColumn('themebreak_ratio', col('#themebreaks') / col('#lines')) \ .withColumn(COLNAME, array_join(COL, FILLER)) # INLINE ELEMENTS # Count codespans df = df.withColumn(COLNAME, split(COL, regex.CODESPAN_RE)) \ .withColumn('#codespans', size(COL) - 1) \ .withColumn('codespan_ratio', col('#codespans') / col('#words')) \ .withColumn(COLNAME, array_join(COL, FILLER)) # Remove markdown escapes df = df.withColumn(COLNAME, regexp_replace(COL, regex.ESCAPE_RE, FILLER)) # Count references (1/2) df = df.withColumn(COLNAME, split(COL, regex.REFERENCE_RE)) \ .withColumn('#references', size(COL) - 1) \ .withColumn(COLNAME, array_join(COL, FILLER)) # Count links (1/2) df = df.withColumn(COLNAME, split(COL, regex.LINK_RE)) \ .withColumn('#links', size(COL) - 1) \ .withColumn(COLNAME, 
array_join(COL, FILLER)) # Count inline images df = df.withColumn(COLNAME, split(COL, regex.INLINE_IMAGE_RE)) \ .withColumn('#inline_images', size(COL) - 1) \ .withColumn(COLNAME, array_join(COL, FILLER)) # # Count references (2/2) df = df.withColumn(COLNAME, split(COL, regex.SHORT_REFERENCE_RE)) \ .withColumn('#references', size(COL) - 1 + col('#references')) \ .withColumn(COLNAME, array_join(COL, FILLER)) # Count links (2/2) df = df.withColumn(COLNAME, split(COL, regex.AUTOLINK_RE)) \ .withColumn('#links', size(COL) - 1 + col('#links')) \ .withColumn(COLNAME, array_join(COL, FILLER)) # Count mails df = df.withColumn(COLNAME, split(COL, regex.AUTOMAIL_RE)) \ .withColumn('#mail_addresses', size(COL) - 1) \ .withColumn(COLNAME, array_join(COL, FILLER)) # Remove line breaks, html, stand-alone * or _ df = df.withColumn(COLNAME, regexp_replace(COL, regex.LINE_BREAK_RE, FILLER)) df = df.withColumn(COLNAME, regexp_replace(COL, regex.HTML_RE, FILLER)) df = df.withColumn(COLNAME, regexp_replace(COL, regex.NOT_STRONG_RE, FILLER)) # Count strong & emphasis df = df.withColumn(COLNAME, split(COL, regex.EM_STRONG_RE)) \ .withColumn('#emphasis', size(COL) - 1) \ .withColumn('#strong', size(COL) - 1) \ .withColumn(COLNAME, array_join(COL, FILLER)) df = df.withColumn(COLNAME, split(COL, regex.STRONG_EM_RE)) \ .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \ .withColumn('#strong', size(COL) - 1 + col('#strong')) \ .withColumn(COLNAME, array_join(COL, FILLER)) df = df.withColumn(COLNAME, split(COL, regex.STRONG_EM3_RE)) \ .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \ .withColumn('#strong', size(COL) - 1 + col('#strong')) \ .withColumn(COLNAME, array_join(COL, FILLER)) df = df.withColumn(COLNAME, split(COL, regex.STRONG_RE)) \ .withColumn('#strong', size(COL) - 1 + col('#strong')) \ .withColumn(COLNAME, array_join(COL, FILLER)) df = df.withColumn(COLNAME, split(COL, regex.EMPHASIS_RE)) \ .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \ .withColumn(COLNAME, array_join(COL, FILLER)) df = df.withColumn(COLNAME, split(COL, regex.EM_STRONG2_RE)) \ .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \ .withColumn('#strong', size(COL) - 1 + col('#strong')) \ .withColumn(COLNAME, array_join(COL, FILLER)) df = df.withColumn(COLNAME, split(COL, regex.STRONG2_RE)) \ .withColumn('#strong', size(COL) - 1 + col('#strong')) \ .withColumn(COLNAME, array_join(COL, FILLER)) df = df.withColumn(COLNAME, split(COL, regex.EMPHASIS2_RE)) \ .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \ .withColumn(COLNAME, array_join(COL, FILLER)) df = df.withColumn('emphasis_ratio', col('#emphasis') / col('#words')) \ .withColumn('strong_ratio', col('#strong') / col('#words')) # Remove unnecessary columns, including parser helper column df = df.drop('_Text', '_PostHistoryTypeId', '_PostId', '#lines', '#words', COLNAME) return df
def prepareDatasets(sc, spark): buisHeader = ['business_id', 'name', 'neighborhood', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'categories'] buis = sc.textFile(datapath+'yelp_business.csv', use_unicode=False) buis = buis.filter(lambda row: not row.startswith('business_id,name'))\ .map(lambda row: re.findall(r'(?:[^,"]|"(?:\\.|[^"])*")+', row.replace(',,', ', ,')))\ .map(lambda row: map(lambda x: x.replace('"', ''), row))\ .map(lambda row: dict(zip(buisHeader, row)))\ .filter(lambda row: row['business_id'] and row['longitude'] and row['latitude'])\ .filter(lambda row: row['business_id'].strip() and row['longitude'].strip() and row['latitude'].strip())\ .toDF() buis = buis.select('business_id', 'name', 'city', 'state', 'postal_code', 'categories', buis['latitude'].cast('float'), buis['longitude'].cast('float'), buis['stars'].cast('float'), buis['review_count'].cast('int'), buis['is_open'].cast('int'))\ .dropna(how='any', subset=['business_id','longitude', 'latitude']) def reviews_mapper(index, lines): import csv reader = csv.reader(lines) if index==0: lines.next() for row in reader: if len(row) == 9 and len(row[1])==22: yield row reviewsHeader = ["review_id","user_id","business_id","stars","date","text","useful","funny","cool"] reviews = sc.textFile(datapath+'yelp_review.csv', use_unicode=False)\ .mapPartitionsWithIndex(reviews_mapper)\ .map(lambda x: dict(zip(reviewsHeader, x)))\ .toDF() reviews = reviews.select( "review_id", "user_id", "business_id", "text", reviews["stars"].cast('float'), reviews["date"].cast('date'), reviews["useful"].cast('int'), reviews["funny"].cast('int'), reviews["cool"].cast('int'))\ .filter(reviews.text.isNotNull())\ .filter(reviews.business_id.isNotNull()) reviews = reviews.alias('a').join(buis.alias('b'), sf.col('b.business_id') == sf.col('a.business_id'))\ .select('b.*','a.text') #,'a.user_id') reviews = reviews.where( 'longitude > {:f} and longitude < {:f} and latitude > {:f} and latitude < {:f}'\ .format(westAMER, eastAMER, southAMER, northAMER) ).cache() id_text = reviews.select('business_id', 'text')\ .groupBy('business_id').agg(sf.concat_ws(' ', sf.collect_list("text")).alias('text_concat')) reviews = reviews.drop(reviews.text)\ .select('business_id','categories','state', 'stars')\ .alias('a').join(id_text.alias('b'), sf.col('b.business_id') == sf.col('a.business_id'))\ .select('a.*','b.text_concat')\ .distinct()\ .withColumnRenamed('text_concat', 'text') # some data cleansing: reviews = reviews.withColumn('text', sf.regexp_replace(reviews.text, '\\/', '/')) def cleanse(text): re_punc = re.compile('[' + re.escape(punctuation) + '0-9\\n\\t\\r]') re_spc = re.compile('[ ]+') # get rid of extra spaces return re_spc.sub(' ', re_punc.sub(" ", text)) cleanser = sf.udf(lambda x: cleanse(x)) reviews = reviews.withColumn('text', cleanser('text')) # tokinizing and removing stop words: import pyspark.ml.feature as sparkml from pyspark.ml import Pipeline tokenizer = sparkml.Tokenizer(inputCol="text", outputCol="words") swremover = sparkml.StopWordsRemover(inputCol='words', outputCol='words_clean') pipeline = Pipeline(stages=[tokenizer, swremover]) reviews = pipeline.fit(reviews).transform(reviews) reviews = reviews.drop('text', 'words') return reviews.cache()
def test_auto_mapper_fhir_patient_resource( spark_session: SparkSession) -> None: # Arrange spark_session.createDataFrame( [ (1, "Qureshi", "Imran", "1970-01-01", "female"), (2, "Vidal", "Michael", "1970-02-02", None), ], ["member_id", "last_name", "first_name", "date_of_birth", "my_gender"], ).createOrReplaceTempView("patients") source_df: DataFrame = spark_session.table("patients") df = source_df.select("member_id") df.createOrReplaceTempView("members") # Act mapper = AutoMapper(view="members", source_view="patients", keys=["member_id"]).complex( Patient( id_=FhirId(A.column("member_id")), birthDate=A.date(A.column("date_of_birth")), name=FhirList([ HumanName(use=NameUseCode("usual"), family=A.column("last_name")) ]), gender=A.if_not_null( A.column("my_gender"), AdministrativeGenderCode( A.column("my_gender"))), )) assert isinstance(mapper, AutoMapper) sql_expressions: Dict[str, Column] = mapper.get_column_specs( source_df=source_df) for column_name, sql_expression in sql_expressions.items(): print(f"{column_name}: {sql_expression}") result_df: DataFrame = mapper.transform(df=df) # Assert assert len(sql_expressions) == 5 assert str(sql_expressions["id"]) == str( substring(regexp_replace(col("b.member_id"), r"[^A-Za-z0-9\-\.]", "-"), 0, 63).cast("string").alias("id")) assert str(sql_expressions["resourceType"]) == str( lit("Patient").cast("string").alias("resourceType")) assert str(sql_expressions["birthDate"]) == str( coalesce( to_date(col("b.date_of_birth"), "y-M-d"), to_date(col("b.date_of_birth"), "yyyyMMdd"), to_date(col("b.date_of_birth"), "M/d/y"), ).cast("date").alias("birthDate")) # assert str(sql_expressions["name"]) == str( # filter( # array( # struct( # lit("usual").alias("use"), # col("b.last_name").alias("family"), # ) # ), lambda x: x.isNotNull() # ).alias("name") # ) # assert str(sql_expressions["gender"]) == str( # when(col("b.my_gender").isNull(), # None).otherwise(col("b.my_gender")).alias("gender") # ) result_df.printSchema() result_df.show() assert (result_df.where("member_id == 1").selectExpr( "name[0].use").collect()[0][0] == "usual") assert (result_df.where("member_id == 1").selectExpr( "name[0].family").collect()[0][0] == "Qureshi") assert (result_df.where("member_id == 2").selectExpr( "name[0].use").collect()[0][0] == "usual") assert (result_df.where("member_id == 2").selectExpr( "name[0].family").collect()[0][0] == "Vidal")
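# The id expression asserted above can be tried on its own. A minimal sketch,
# assuming only pyspark: FHIR resource ids may contain A-Z, a-z, 0-9, '-' and
# '.', so everything else is replaced with '-' and the result is truncated to
# at most 63 characters to stay inside FHIR's 64-character limit.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, regexp_replace, substring

spark = SparkSession.builder.getOrCreate()
ids = spark.createDataFrame([("member 001/AB",)], ["member_id"])
ids.select(
    substring(regexp_replace(col("member_id"), r"[^A-Za-z0-9\-\.]", "-"), 0, 63)
    .alias("id")).show(truncate=False)
# "member 001/AB" -> "member-001-AB"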
lpad(lit("HELLO"), 3, " ").alias("lp"), rpad(lit("HELLO"), 10, " ").alias("rp")).show(2) # -- in SQL # SELECT # ltrim(' HELLLOOOO '), # rtrim(' HELLLOOOO '), # trim(' HELLLOOOO '), # lpad('HELLOOOO ', 3, ' '), # rpad('HELLOOOO ', 10, ' ') # FROM dfTable # regexp from pyspark.sql.functions import regexp_replace regex_string = "BLACK|WHITE|RED|GREEN|BLUE" df.select( regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"), col("Description")).show(2) # -- in SQL # SELECT # regexp_replace(Description, 'BLACK|WHITE|RED|GREEN|BLUE', 'COLOR') as # color_clean, Description # FROM dfTable # replace strings from pyspark.sql.functions import translate df.select(translate(col("Description"), "LEET", "1337"),col("Description"))\ .show(2) # -- in SQL # SELECT translate(Description, 'LEET', '1337'), Description FROM dfTable # regexp_extract from pyspark.sql.functions import regexp_extract
def process(rdd):
    start = time.time()
    global accuracy
    global completed
    # Get the singleton instance of SparkSession
    spark = getSparkSessionInstance(rdd.context.getConf())

    # Convert RDD[String] to RDD[Row] to DataFrame
    rowRdd = rdd.map(
        lambda x: Row(Summons_Number=str(x[0]),
                      Registration_State=str(x[2]),
                      Plate_Type=str(x[3]),
                      Violation_Code=str(x[5]),
                      Vehicle_Body_Type=str(x[6]),
                      Vehicle_Make=str(x[7]),
                      Issuing_Agency=str(x[8]),
                      Street_Code1=str(x[9]),
                      Street_Code2=str(x[10]),
                      Street_Code3=str(x[11]),
                      Violation_County=str(x[13]),
                      Issuer_Precinct=str(x[14]),
                      Issuer_Command=str(x[16]),
                      Issuer_Squad=str(x[17]),
                      Violation_In_Front_Of_Or_Opposite=str(x[21]),
                      Issue_Date=str(x[4]),
                      Violation_Time=str(x[18]),
                      Violation_Location=str(x[20])))
    df = spark.createDataFrame(rowRdd)

    ############################################## PREPROCESSING ##################################
    # Splitting the issue date into month, year, day
    df_new = df.withColumn('Month', split('Issue_Date', '/')[0]).withColumn(
        'Year', split('Issue_Date', '/')[2]).withColumn(
            'Day', day_udf(col('Issue_Date'))).withColumn(
                'Time', time_udf(col('Violation_Time')))

    # Casting the split columns to numeric types (Year to integer, the rest to double)
    df_new = df_new.withColumn("Year", df_new["Year"].cast(
        IntegerType())).withColumn("Month", df_new["Month"].cast(
            DoubleType())).withColumn("Day", df_new["Day"].cast(
                DoubleType())).withColumn("Time", df_new["Time"].cast(DoubleType()))

    # Removing outliers and some filtering
    df_new = df_new.drop(
        *['Issue_Date', 'Violation_Time', 'Year', 'Issuer_Squad'])

    # Filling na
    df_new = df_new.fillna({'Time': 3})
    # Removing rows with null Violation_Location and Violation_County
    df_new = df_new.dropna(how='any',
                           subset=['Violation_Location', 'Violation_County'])
    df_new = df_new.dropna(how='any')

    # Normalising violation location codes to a common set of county abbreviations
    df_new = df_new.withColumn('Violation_Location', regexp_replace('Violation_Location', 'KINGS', 'K'))\
        .withColumn('Violation_Location', regexp_replace('Violation_Location', 'KING', 'K'))\
        .withColumn('Violation_Location', regexp_replace('Violation_Location', 'QUEEN', 'Q'))\
        .withColumn('Violation_Location', regexp_replace('Violation_Location', 'QU', 'Q'))\
        .withColumn('Violation_Location', regexp_replace('Violation_Location', 'NEWY', 'NY'))\
        .withColumn('Violation_Location', regexp_replace('Violation_Location', 'NEW Y', 'NY'))\
        .withColumn('Violation_Location', regexp_replace('Violation_Location', 'MAN', 'NY'))\
        .withColumn('Violation_Location', regexp_replace('Violation_Location', 'MH', 'NY'))\
        .withColumn('Violation_Location', regexp_replace('Violation_Location', 'BRONX', 'BX'))
    # df_new.show()
    ################################################################################################

    # Prediction using saved model
    df_r1 = model.transform(df_new)
    # df_r1.show()
    # df_r1.dropna()
    df_with_cat = df_r1.withColumn("correct", udfaccuracy("label", "prediction"))
    # df_with_cat.show()
    correct_array = df_with_cat.select(
        "label", "prediction").rdd.map(lambda r: int(r[0]) - int(r[1]) == 0).collect()
    num = len(correct_array)
    temp = sum(correct_array)
    completed += num
    accuracy += temp
    end = time.time()
    print("Labels correct till now:{}/{}".format(accuracy, completed))
    print("Completed batch of {} in {}sec".format(num, end - start))
    df_r1.show()
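# day_udf and time_udf are defined elsewhere in the original job and are not
# shown in this snippet. The sketch below is hypothetical: one plausible
# reading is that they derive a day-of-week from the issue date and an hour
# bucket from NYC violation-time strings such as "0835A" / "0235P".
from datetime import datetime
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

def _day_of_week(issue_date):
    # hypothetical: weekday index parsed from an M/d/Y date string
    try:
        return float(datetime.strptime(issue_date, "%m/%d/%Y").weekday())
    except (ValueError, TypeError):
        return None

def _hour_bucket(violation_time):
    # hypothetical: 24-hour value parsed from strings like "0835A" or "0235P"
    try:
        hour = int(violation_time[:2]) % 12
        if violation_time[-1].upper() == 'P':
            hour += 12
        return float(hour)
    except (ValueError, TypeError, IndexError):
        return None

day_udf = udf(_day_of_week, DoubleType())
time_udf = udf(_hour_bucket, DoubleType())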
def applyModel(fileName, loadModelName, outlierPercentile=100):
    sc = SparkContext('local', 'pyspark')
    sqlContext = SQLContext(sc)

    #########
    # load data
    #########
    data = sc.textFile(fileName)

    # extract header and remove it
    header = data.first()
    data = data.filter(lambda x: x != header).cache()
    header = header.split('\t')

    # parse data
    data = data.map(lambda x: x.split('\t'))

    #########
    # prepare features
    #########
    df = sqlContext.createDataFrame(data, header)
    df = (df.withColumn("ADLOADINGTIME", func.regexp_replace('ADLOADINGTIME', 'null', '0').cast('float'))
          .withColumn("TIMESTAMP", func.regexp_replace('TIMESTAMP', 'null', '0').cast('int'))
          .withColumn("GEOIP_LAT", func.regexp_replace('GEOIP_LAT', 'null', '0').cast('int'))
          .withColumn("GEOIP_LNG", func.regexp_replace('GEOIP_LNG', 'null', '0').cast('int'))
          .withColumn("HOSTWINDOWHEIGHT", func.regexp_replace('HOSTWINDOWHEIGHT', 'null', '0').cast('int'))
          .withColumn("HOSTWINDOWWIDTH", func.regexp_replace('HOSTWINDOWWIDTH', 'null', '0').cast('int'))
          .withColumn("TOPMOSTREACHABLEWINDOWHEIGHT", func.regexp_replace('TOPMOSTREACHABLEWINDOWHEIGHT', 'null', '0').cast('int'))
          .withColumn("TOPMOSTREACHABLEWINDOWWIDTH", func.regexp_replace('TOPMOSTREACHABLEWINDOWWIDTH', 'null', '0').cast('int'))
          )

    thr = np.percentile(df.select("ADLOADINGTIME").rdd.collect(), outlierPercentile)
    df = df.filter(func.col('ADLOADINGTIME') < thr)

    df = df.withColumn("TOPMOSTREACHABLEWINDOWAREA",
                       func.col("TOPMOSTREACHABLEWINDOWHEIGHT") * func.col("TOPMOSTREACHABLEWINDOWWIDTH"))
    df = df.withColumn("INTENDENTISACTUALDEVICETYPE",
                       (func.col("ACTUALDEVICETYPE") == func.col("INTENDEDDEVICETYPE")).cast('int'))
    df = df.withColumn("COMBINEDID", func.concat(
        func.col('ACCOUNTID'), func.col('CAMPAIGNID'), func.col('CREATIVEID'), func.col('SDK')))
    # df = df.withColumn("COMBINEDID", func.regexp_replace("COMBINEDID", '^$', 'NA'))

    df = df.withColumn("COMBINEDEXTERNALID", func.concat(
        func.regexp_replace('EXTERNALADSERVER', 'null', ''),
        func.regexp_replace('EXTERNALPLACEMENTID', 'null', ''),
        func.regexp_replace('EXTERNALSITEID', 'null', ''),
        func.regexp_replace('EXTERNALSUPPLIERID', 'null', '')))
    # df = df.withColumn("COMBINEDEXTERNALID", func.regexp_replace("COMBINEDEXTERNALID", '^$', 'NA'))

    df = df.withColumn("PLATFORMCOMBINED", func.concat(
        func.regexp_replace('PLATFORM', 'null', ''),
        func.regexp_replace('PLATFORMVERSION', 'null', '')))
    # df = df.withColumn("PLATFORMCOMBINED", func.regexp_replace("PLATFORMCOMBINED", '^$', 'NA'))

    df = df.withColumn("UA_OSCOMB", func.concat(
        func.regexp_replace('UA_OS', 'null', ''),
        func.regexp_replace('UA_OSVERSION', 'null', '')))
    # df = df.withColumn("UA_OSCOMB", func.regexp_replace("UA_OSCOMB", '^$', 'NA'))

    # reduce FILESJSON to a comma-separated list of sizes, then sum it
    df = df.withColumn("FILESJSON_SIZE", func.regexp_replace('FILESJSON', '[^,\d]', ''))
    df = df.withColumn("FILESJSON_SIZE", func.regexp_replace('FILESJSON_SIZE', '^,', ''))
    df = df.withColumn("FILESJSON_SIZE", func.regexp_replace('FILESJSON_SIZE', ',,', ','))
    udf = func.udf(lambda x: int(np.fromstring(x, dtype=int, sep=',').sum()), IntegerType())
    df = df.withColumn("FILESJSON_SIZE", udf("FILESJSON_SIZE"))

    print('Loaded and prepared %d entries' % df.count())

    #########
    # keep only needed features
    #########
    features = ['ADLOADINGTIME', 'PLACEMENTID', 'TIMESTAMP', 'CREATIVETYPE',
                'UA_HARDWARETYPE', 'UA_VENDOR', 'UA_MODEL', 'UA_BROWSER',
                'UA_BROWSERVERSION', 'FILESJSON', 'ERRORSJSON',
                'TOPMOSTREACHABLEWINDOWAREA', 'FILESJSON_SIZE', 'COMBINEDID',
                'COMBINEDEXTERNALID', 'PLATFORMCOMBINED', 'UA_OSCOMB', 'SDK',
                'EXTERNALADSERVER']
    df = df.select(features)

    #########
    # Convert categorical features to numerical
    #########
    featuresCat = ['PLACEMENTID', 'CREATIVETYPE', 'UA_HARDWARETYPE', 'UA_VENDOR',
                   'UA_MODEL', 'UA_BROWSER', 'UA_BROWSERVERSION', 'FILESJSON',
                   'ERRORSJSON', 'COMBINEDID', 'COMBINEDEXTERNALID',
                   'PLATFORMCOMBINED', 'UA_OSCOMB', 'SDK', 'EXTERNALADSERVER']

    for i in range(len(featuresCat)):
        indexer = StringIndexer(inputCol=featuresCat[i],
                                outputCol='_' + featuresCat[i]).setHandleInvalid("skip").fit(df)
        df = indexer.transform(df).drop(featuresCat[i])
        writer = indexer._call_java("write")
        writer.overwrite().save("indexer_" + featuresCat[i])

    featuresCat = ['_' + featuresCat[i] for i in range(len(featuresCat))]
    features = featuresCat[:]
    features.append('TIMESTAMP')
    features.append('FILESJSON_SIZE')
    features.append('TOPMOSTREACHABLEWINDOWAREA')

    #########
    # Assemble features
    #########
    assembler = VectorAssembler(inputCols=features, outputCol="features")
    df = assembler.transform(df)

    #########
    # Convert to labeled point
    #########
    # go through the RDD API so this also works on Spark 2+, where DataFrame has no .map
    lp = (df.select(func.col("ADLOADINGTIME").alias("label"), func.col("features"))
          .rdd.map(lambda row: LabeledPoint(row.label, row.features)))
    lp.cache()

    #########
    # Load trained model
    #########
    model = RandomForestModel.load(sc, loadModelName)
    print('Model loaded!')

    predictions = model.predict(lp.map(lambda x: x.features)).collect()

    return predictions
from pyspark.sql.functions import col, lit, ltrim, rtrim, rpad, lpad, trim

df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
    rtrim(lit("    HELLO    ")).alias("rtrim"),
    trim(lit("    HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)


# COMMAND ----------

from pyspark.sql.functions import regexp_replace

regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
    col("Description")).show(2)


# COMMAND ----------

from pyspark.sql.functions import translate

df.select(translate(col("Description"), "LEET", "1337"), col("Description"))\
    .show(2)


# COMMAND ----------

from pyspark.sql.functions import regexp_extract

extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
def _transform_data(df_price, df_demanda, df_produccion, df_pinternac, df_subasta):
    """Transform the original datasets.

    :param df_price: Hourly price DataFrame (PESPANIA / PPORTUGAL).
    :param df_demanda: Demand DataFrame.
    :param df_produccion: Production DataFrame.
    :param df_pinternac: International prices (pandas) DataFrame.
    :param df_subasta: Auction dummy DataFrame.
    :return: Transformed DataFrame.
    """
    # Fix decimal separators: drop the thousands dot, turn the decimal comma into a dot
    bad_columns = [
        'TOTAL_IMPORTACION_ES', 'TOTAL_PRODUCCION_ES', 'TOTAL_DEMANDA_NAC_ES',
        'TOTAL_EXPORTACIONES_ES', 'TOTAL_DDA_ES', 'TOTAL_POT_IND_ES',
        'TOTAL_PRODUCCION_POR', 'TOTAL_DEMANDA_POR'
    ]
    for i in bad_columns:
        df_demanda = (df_demanda.withColumn(i, regexp_replace(
            i, '\\.', '')).withColumn(i, regexp_replace(i, ',', '.').cast('float')))

    bad_columns = [
        'HIDRAULICA_CONVENC', 'HIDRAULICA_BOMBEO', 'NUCLEAR', 'CARBON NACIONAL',
        'CARBON_IMPO', 'CICLO_COMBINADO', 'FUEL_SIN_PRIMA', 'FUEL_PRIMA',
        'REG_ESPECIAL'
    ]
    for i in bad_columns:
        df_produccion = (df_produccion.withColumn(
            i, regexp_replace(i, '\\.', '')).withColumn(
                i, regexp_replace(i, ',', '.').cast('float')))

    # Nulls here mean zero production or zero imports
    df_produccion = df_produccion.fillna(0)
    df_demanda = df_demanda.fillna(0)

    # Date variables
    df_price = df_price.select(
        *['ANIO', 'MES', 'DIA', 'HORA', 'PESPANIA', 'PPORTUGAL'])
    funct = udf(lambda x: x.zfill(2), StringType())
    df_price = df_price.withColumn('DIA', funct(df_price['DIA']))
    df_produccion = df_produccion.withColumn('DIA', funct(df_produccion['DIA']))
    df_demanda = df_demanda.withColumn('DIA', funct(df_demanda['DIA']))
    df_price = df_price.withColumn('MES', funct(df_price['MES']))
    df_produccion = df_produccion.withColumn('MES', funct(df_produccion['MES']))
    df_demanda = df_demanda.withColumn('MES', funct(df_demanda['MES']))

    df_demanda = df_demanda.withColumn(
        'DATE', concat(col('DIA'), lit('-'), col('MES'), lit('-'), col('ANIO')))
    df_produccion = df_produccion.withColumn(
        'DATE', concat(col('DIA'), lit('-'), col('MES'), lit('-'), col('ANIO')))
    df_price = df_price.withColumn(
        'DATE', concat(col('DIA'), lit('-'), col('MES'), lit('-'), col('ANIO')))

    # Group by day
    df_price = (df_price.groupby('DATE').agg({
        'PESPANIA': 'avg',
        'PPORTUGAL': 'avg'
    }).withColumnRenamed('avg(PESPANIA)', 'PSPAIN').withColumnRenamed(
        'avg(PPORTUGAL)', 'PPORTUGAL'))
    df_demanda = df_demanda.groupby('DATE').sum()
    df_produccion = df_produccion.fillna(0)
    df_produccion = df_produccion.groupby('DATE').sum()

    # SUBASTA (auction dummy)
    df = df_price.join(df_subasta, how='left', on='DATE').fillna({'DUMMY': 0})
    delete_var = ['ANIO', 'MES', 'DIA', 'HORA']
    df_demanda = df_demanda.drop(*delete_var)
    df_produccion = df_produccion.drop(*delete_var)
    df = df.drop(*['ANIO', 'DIA', 'HORA'])
    df = df.join(df_demanda, how='left', on='DATE')
    df = df.join(df_produccion, how='left', on='DATE')

    # INTERPOLATE international prices and merge (pandas from here on)
    df = df.toPandas()
    df_pinternac = df_pinternac.interpolate(limit_direction='backward', method='nearest')
    df = pd.merge(df, df_pinternac, how='left', left_on='DATE', right_on='FECHA')
    del df['FECHA']
    df['DATE'] = pd.to_datetime(df['DATE'], format='%d-%m-%Y')
    '''
    for colm in ['TAVG', 'TMAX', 'TMIN']:
        df_weather_nor[colm] = ((df_weather_nor[colm] - 32)*5/9).map(int)  # Fahrenheit to Celsius
    '''

    # DUMMY VARS: propagate the auction dummy backward and forward in time
    df = df.sort_values(by='DATE', ascending=True)
    dummy_var = [
        3, 5, 7, 10, 14, 15, 20, 25, 30, 45, 50, 55, 60, 65, 70, 75, 80, 85, 90
    ]
    df.loc[df['DUMMY'].isin(['0', 0]), 'DUMMY'] = np.NaN
    for i in dummy_var:
        name = 'DUMMY_BACK_' + str(i) + '_DAY'
        df[name] = pd.Series(df['DUMMY'], index=df.index)
        rows = i
        df[name] = df[name].bfill(axis=0, limit=rows)
        df[name] = df[name].fillna(0)
    for i in dummy_var:
        name = 'DUMMY_FORW_' + str(i) + '_DAY'
        df[name] = pd.Series(df['DUMMY'], index=df.index)
        rows = i
        df[name] = df[name].ffill(axis=0, limit=rows)
        df[name] = df[name].fillna(0)
    df['DUMMY'] = df['DUMMY'].fillna(0)
    df = df.dropna(axis=0, how='any')

    # WORK DAY
    df['DATE'] = pd.to_datetime(df['DATE'], format='%d-%m-%Y')
    df['WEEKDAY'] = df['DATE'].dt.dayofweek
    df['MES'] = df['DATE'].dt.month
    df['WORKDAY'] = pd.Series(0, index=df.index)
    df.loc[df['WEEKDAY'].isin([0, 1, 2, 3, 4]), 'WORKDAY'] = 1
    for i in STRING.feriados_spain:
        df.loc[df['DATE'] == i, 'WORKDAY'] = 0
    del df['WEEKDAY']

    # NULL PRICE
    df['NULL_PRICE'] = pd.Series(0, index=df.index)
    df.loc[df['DATE'].between('2013-03-28', '2013-04-02', inclusive=True),
           'NULL_PRICE'] = 1

    # SUMMER-WINTER
    df['SUMMER'] = pd.Series(0, index=df.index)
    df.loc[df['MES'].isin([7, 8]), 'SUMMER'] = 1
    df['WINTER'] = pd.Series(0, index=df.index)
    df.loc[df['MES'].isin([12, 1]), 'WINTER'] = 1
    del df['MES']

    bool_cols = [
        col for col in df if df[[col]].dropna().isin([0, 1]).all().values
    ]
    for i in df.drop(bool_cols + ['DATE'], axis=1).columns.values.tolist():
        df[i] = df[i].map(float)
        df[i] = df[i].round(2)

    return df
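# A minimal sketch of the decimal-separator fix used at the top of
# _transform_data, assuming a local SparkSession; the sample value and the
# choice of column are illustrative only.
from pyspark.sql import SparkSession
from pyspark.sql.functions import regexp_replace

spark = SparkSession.builder.getOrCreate()
toy = spark.createDataFrame([("1.234,56",)], ["TOTAL_DEMANDA_NAC_ES"])
toy = (toy.withColumn("TOTAL_DEMANDA_NAC_ES",
                      regexp_replace("TOTAL_DEMANDA_NAC_ES", "\\.", ""))  # drop thousands separator
          .withColumn("TOTAL_DEMANDA_NAC_ES",
                      regexp_replace("TOTAL_DEMANDA_NAC_ES", ",", ".").cast("float")))
toy.show()  # "1.234,56" -> 1234.56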