def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces."""
    no_punct = regexp_replace(column, r"\p{Punct}", '')
    lowered = lower(no_punct)
    cleaned = trim(lowered)
    return cleaned
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    #column_val = regexp_replace(column, "\p{Punct}", "")
    #return trim(lower(column_val))
    word = lower(trim(regexp_replace(regexp_replace(column, '[^\w\s]', ''),'_',''))).alias("word")
    return word
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.
    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.
    Args:
        column (Column): A Column containing a sentence.
    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    return (trim(regexp_replace(lower(column),'[^a-zA-Z0-9 ]','')).alias('sentence'))
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    
#     assert(isinstance(column, pyspark.sql.column.Column))
    assert(str(type(column)) == "<class 'pyspark.sql.column.Column'>")    
    
    columnNoPunct = regexp_replace(column, "[^a-zA-Z0-9 ]", "")
#     columnNoPunct = regexp_replace(column, string.punctuation, "")    
    columnLowerCase = lower(columnNoPunct)
    columnTrimmed = trim(columnLowerCase)
    
    return columnTrimmed
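
# A minimal usage sketch for the removePunctuation variants above, assuming an active
# SparkSession named `spark` and that `col` is imported from pyspark.sql.functions:
sentence_df = spark.createDataFrame(
    [('Hi, you!',), (' No under_score!',)], ['sentence'])
sentence_df.select(removePunctuation(col('sentence'))).show(truncate=False)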
print(f"How many TX records did we get?")
tx_cnt = texas_df.count()
print(f"We got: %i " % tx_cnt)

#Rename our LoanRange column to an estimated loan amount to match the existing sub 150k loan data.
filtered_df = texas_df.select(
    col("LoanRange").alias("LoanAmount"), "City", "State", "Zip",
    "BusinessType", "NonProfit", "JobsRetained", "DateApproved", "Lender")

#Use regular expressions to replace the text values with the average dollar amount and cast the column to a double type
value_df = filtered_df.select("City", "State", "Zip", "BusinessType",
                              "NonProfit", "JobsRetained", "DateApproved",
                              "Lender")
value_df = filtered_df.withColumn(
    "LoanAmount",
    regexp_replace(col("LoanAmount"), "[a-z] \$5-10 million",
                   "7500000").cast("double"))
value_df = value_df.withColumn(
    'LoanAmount',
    regexp_replace(col("LoanAmount"), "[a-z] \$1-2 million",
                   "1500000").cast("double"))
value_df = value_df.withColumn(
    'LoanAmount',
    regexp_replace(col("LoanAmount"), "[a-z] \$5-10 million",
                   "7500000").cast("double"))
value_df = value_df.withColumn(
    'LoanAmount',
    regexp_replace(col("LoanAmount"), "[a-z] \$2-5 million",
                   "3500000").cast("double"))
value_df = value_df.withColumn(
    'LoanAmount',
    regexp_replace(col("LoanAmount"), "[a-z] \$350,000-1 million",
new_viagens_df = viagens_df.select(
    col('Identificador do processo de viagem').alias(
        'identificador_do_processo_de_viagem'),
    col('Situação').alias('situacao'),
    col('Código do órgão superior').alias('codigo_do_orgao_superior'),
    col('Nome do órgão superior').alias('nome_do_orgao_superior'),
    col('Código órgão solicitante').alias('codigo_orgao_solicitante'),
    col('Nome órgão solicitante').alias('nome_orgao_solicitante'),
    col('CPF viajante').alias('cpf_viajante'),
    col('Nome').alias('nome'),
    col('Cargo').alias('cargo'),
    col('Período - Data de início').alias('periodo_data_de_inicio'),
    col('Período - Data de fim').alias('periodo_data_de_fim'),
    col('Destinos').alias('destinos'),
    col('Motivo').alias('motivo'),
    regexp_replace(col('Valor diárias'), ",",
                   "").cast("decimal").alias("valor_diarias"),
    regexp_replace(col('Valor passagens'), ",",
                   "").cast("decimal").alias("valor_passagens"),
    regexp_replace(col('Valor outros gastos'), ",",
                   "").cast("decimal").alias("valor_outros_gastos")).cache()

# COMMAND ----------

# exercise 1

new_viagens_df.write.mode('overwrite').parquet(output_path + "viagens_parquet")
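
# Hedged sanity check, assuming the same `output_path` and an active `spark` session:
# read the Parquet back and confirm the schema with the cleaned decimal columns.
check_df = spark.read.parquet(output_path + "viagens_parquet")
check_df.printSchema()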

# COMMAND ----------

# exercise 2
viagens_df.coalesce(1).write.mode('overwrite')\
Example #7
# fill null values in Headline with ''
df_news = df_news.fillna({'Headline': ''})

# parse the timestamp in order to make time windows
df_news = df_news.withColumn('PublishDate1',
                             F.to_date('PublishDate', "yyyy-MM-dd HH:mm:ss"))
df_timestamped = df_news.select(['PublishDate1', 'Topic', 'Title', 'Headline'])

# drop duplicates
#df_timestamped = df_timestamped.dropDuplicates(['Title', 'Headline'])

# remove punctuation from text data, text to lower case and trim whitespaces
df_timestamped = df_timestamped.withColumn(
    'Title',
    F.trim(F.lower(F.regexp_replace(F.col('Title'), '[^\sa-zA-Z0-9]', ''))))
df_timestamped = df_timestamped.withColumn(
    'Headline',
    F.trim(F.lower(F.regexp_replace(F.col('Headline'), '[^\sa-zA-Z0-9]', ''))))

# tokenize titles and headlines
title_tokenizer = Tokenizer(inputCol='Title', outputCol='Title_words')
headline_tokenizer = Tokenizer(inputCol='Headline', outputCol='Headline_words')
df_timestamped = title_tokenizer.transform(df_timestamped)
df_timestamped = headline_tokenizer.transform(df_timestamped)

# remove stop words
titel_remover = StopWordsRemover(inputCol='Title_words',
                                 outputCol='Title_final')
headline_remover = StopWordsRemover(inputCol='Headline_words',
                                    outputCol='Headline_final')
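
# Hedged continuation (the original snippet is cut off here): apply the two removers
# defined above to produce the final token columns.
df_timestamped = titel_remover.transform(df_timestamped)
df_timestamped = headline_remover.transform(df_timestamped)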
Example #8
How to get it done?

I've broken this task into 5 steps, which are as follows:

#Step-1: Break fits_assembly_name column into assembly_name & models
After observing the pattern, I've figured out the following:
    -   Model numbers are mentioned after the hyphen (-) in the fits_assembly_name column, and
    -   They are condensed to fit the space. E.g. three different model numbers V08AB26, V08GB26 and V08LB26 are written as V08AB26/GB26/LB26.
        -   POS has used "/" and mentioned only the part that differs from the earlier model numbers.

To pre-process the column Assembly_name, use regexp_replace + split to separate the model numbers into a new column and remove them from the original column Assembly_name:
    -   I've used the "regexp_replace" function available in pyspark.sql.functions and the DataFrame "selectExpr" method.
        -   I've split the fits_assembly_name column string on " - " and created two new columns, i.e. pc_Assemblyname_Withoutmodelno and Models, as follows:
        -   regexp_replace: Replaces all substrings of the specified string value that match regexp with rep
            - Usage: regexp_replace(x, pattern, replacement)
        -   selectExpr: Projects a set of SQL expressions and returns a new DataFrame. (Source: https://spark.apache.org/docs/1.5.2/api/python/pyspark.sql.html)
```
from pyspark.sql.functions import regexp_replace, split

df0 = df.withColumn('new_col', split(regexp_replace('Assembly_name', r'^(.*)-\s*(\S+)(.*)$', '$1$3\0$2'),'\0')) \
    .selectExpr(
        'Itemno'
      , 'Assembly_id'
      , "coalesce(new_col[0], Assembly_name) as Assembly_name"
      , "coalesce(new_col[1], '') as models"
)

df0.show(truncate=False)
+-------+-----------+---------------------------------------------------------------+--------------------+
|Itemno |Assembly_id|Assembly_name                                                  |models              |
Example #9
# We then split the words into tokens.
# https://spark.apache.org/docs/latest/ml-features.html#tokenizer

# In[9]:
from pyspark.sql.functions import regexp_replace, trim, col, lower, udf
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover

timestart = datetime.datetime.now()

print("abstracts_full_df2.head() = {}".format(abstracts_full_df2.head()))

# Convert the content to Lower Case
print("Converting the abstarct to Lower Case ... ")
abstracts_full_df3 = abstracts_full_df2.withColumn("abstractNew", lower(col("abstract"))).\
    withColumn("abstractNew", regexp_replace("abstractNew", r'[^\w_ -]', ""))

abstracts_full_df3.printSchema()
# print("abstracts_full_df3.head() = {}".format(abstracts_full_df3.head()))

# Tokenize the Abstracts
print("tokenizating the abstracts... ")
tokenizer = Tokenizer(inputCol="abstractNew", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtWords")

abstracts_full_df4 = tokenizer.transform(abstracts_full_df3)

print("After tokenization: ")
abstracts_full_df4.printSchema()
print("abstracts_full_df4.count() = {}".format(abstracts_full_df4.count()))
# print("abstracts_full_df4.head() = {}".format(abstracts_full_df4.head()))
Example #10
# checking the structure of the WorldCupMatches dataframe
df_wcm.printSchema()
print('The df_wc dataframe has ' + str(df_wcm.count()) + ' records.')
print('The df_wc dataframe has ' + str(df_wcm.distinct().count()) +
      ' distinct records.')

# COMMAND ----------

from pyspark.sql.functions import desc, asc, col, column, expr, instr, length, substring, regexp_replace, trim, lit, initcap, sum, concat
#
# Transformations on the WorldCupPlayers dataframe
# The Event field stores values like "G43' G87'", which means the player scored two goals, so we need to count the number of "G" characters
# In the same way we can obtain the number of penalties and cards
df_wcp1=df_wcp.withColumn('POSICION_JUGADOR',expr("case when position='C' THEN  'Captain' WHEN position='GK' THEN 'Goalkeeper' ELSE 'Other' end "))\
             .withColumn('NOMBRE_JUGADOR', initcap(regexp_replace('Player Name','�','u')))\
             .withColumn('NUMERO_GOLES',length('Event')-length(trim(regexp_replace('Event','G',''))))\
             .withColumn('NUMERO_PENALES',length('Event')-length(trim(regexp_replace('Event','P',''))))\
             .withColumn('NUMERO_PENALES_FALLADOS',length('Event')-length(trim(regexp_replace('Event','MP',''))))\
             .withColumn('NUMERO_TARJETAS_ROJAS',length('Event')-length(trim(regexp_replace('Event','R',''))))
# Replace null values with 0
df_wcp1=df_wcp1.withColumn('NUMERO_GOLES',expr("case when NUMERO_GOLES is null then 0 else NUMERO_GOLES end "))\
              .withColumn('NUMERO_PENALES',expr("case when NUMERO_PENALES is null then 0 else NUMERO_PENALES end "))\
              .withColumn('NUMERO_PENALES_FALLADOS',expr("case when NUMERO_PENALES_FALLADOS is null then 0 else NUMERO_PENALES_FALLADOS end "))\
              .withColumn('NUMERO_TARJETAS_ROJAS',expr("case when NUMERO_TARJETAS_ROJAS is null then 0 else NUMERO_TARJETAS_ROJAS end "))\
              .withColumnRenamed('Team Initials','INICIALES_PAIS')\
              .drop('Player Name','Position','Shirt Number')

# Aggregate by player name, position and country initials
df_wcp_rep=df_wcp1.select('NOMBRE_JUGADOR','POSICION_JUGADOR','INICIALES_PAIS','NUMERO_GOLES','NUMERO_PENALES','NUMERO_PENALES_FALLADOS','NUMERO_TARJETAS_ROJAS')\
.groupby('NOMBRE_JUGADOR','POSICION_JUGADOR','INICIALES_PAIS')\
Example #11
import sys
from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import col, regexp_replace

def hathiRecord(r):
    return dict([(f["@name"], f["#VALUE"]) for f in r.field])

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: pretty-cluster.py <input> <output>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="Prettyprint Clusters")
    sqlContext = SQLContext(sc)

    raw = sqlContext.read.format('com.databricks.spark.xml') \
                         .options(rowTag='doc') \
                         .load(sys.argv[1])

    sqlContext.createDataFrame(raw.map(hathiRecord), samplingRatio=1) \
        .withColumn('seq', col('seq').cast('int')) \
        .withColumnRenamed('htid', 'book') \
        .withColumnRenamed('content', 'text') \
        .withColumnRenamed('year', 'date') \
        .withColumn('text',
                    regexp_replace(regexp_replace(col('text'), '&', '&amp;'), '<', '&lt;')) \
        .repartition(200) \
        .write.save(sys.argv[2])
    
    sc.stop()
Example #12
print("Start %d Cities Cluster Test, NSlaves = %d" % (NCities, NSlaves), flush=True)
print("Execution time #0 %f" % (time.time() - startTime),flush=True)

# ## Read two mapping files into dataframes
# - Read files from Amazon S3 bucket into Spark dataframes
# - Format columns as required to enable joins to dataset below

# read and process city FIPS to county FIPS mapping file
city_to_fips = spark.read.format("org.apache.spark.csv").option("header","true") \
                    .csv(latlonFilename)
#                          .csv("/home/ubuntu/project/data/uscitiesLatLongFIPS.csv")

city_to_fips = city_to_fips.withColumn("county_FIPS", f.lpad(city_to_fips['county_FIPS'],5,"0"))
city_to_fips = city_to_fips.drop("city","zip","id","source","population")
city_to_fips = city_to_fips.withColumn("city_ascii", f.regexp_replace('city_ascii', 'Saint', 'St.'))
city_to_fips = city_to_fips.withColumnRenamed("city_ascii","CityName") \
                           .withColumnRenamed("state_name","StateDesc") \
                           .withColumnRenamed("county_FIPS","FIPS")

print((city_to_fips.count(), len(city_to_fips.columns)))
city_to_fips.limit(5).toPandas()


# read and process commuting zone to county FIPS mapping file
cz_to_fips = spark.read.format("org.apache.spark.csv").option("header","true").option("delimiter", "\t") \
                  .csv(lmaFilename)
#                        .csv("/home/ubuntu/project/data/1990LMAascii.csv")
    
cz_to_fips = cz_to_fips.filter(cz_to_fips.FIPS !="None")
cz_to_fips = cz_to_fips.withColumn("stateabbrv", cz_to_fips["County Name"].substr(-2,99))
Example #13
dfStreams = dfparquetSrcStreams.withColumn("year",sf.split("createdDate","\-")[0]) \
                .withColumn("month",sf.split("createdDate","\-")[1]) \
                .withColumn("day",sf.split((sf.split((sf.split("createdDate","\-")[2]),"T")[0])," ")[0])

dfbaseDataTrans = dfTrans.select([col for col in dfTrans.columns])
dfbaseDataSummary = dfSummary.select([col for col in dfSummary.columns])
dfbaseDataStreams = dfStreams.select([col for col in dfStreams.columns if not col.startswith("streams_transactions_")])
dfbaseDataMapTblInt = dfStreams.select(sf.col("id"),sf.col("year"),sf.col("month"),sf.col("day"),sf.col("streams_id"),dfStreams.colRegex("`streams_transactions_[0-9_]+_id`"))
dfbaseDataMapTblStr = dfStreams.select(sf.col("id"),sf.col("year"),sf.col("month"),sf.col("day"),sf.col("streams_id"),dfStreams.colRegex("`streams_transactions_[a-zA-Z_]*`"))

dfbaseDataMapTblInt_1 = dfbaseDataMapTblInt.withColumn("streams_transactions_id", concat_udf(sf.sort_array(sf.array([col for col in dfbaseDataMapTblInt.columns if col.startswith("streams_transactions_")]))))

dfbaseDataMapTblInt_2 = dfbaseDataMapTblInt_1.select(sf.col("id"),sf.col("year"),sf.col("month"),sf.col("day"),sf.col("streams_id"),sf.col("streams_transactions_id"))

dfbaseDataMapTblInt_2 = dfbaseDataMapTblInt_2.withColumn("transactions_id",sf.explode(sf.split(sf.trim(sf.regexp_replace("streams_transactions_id","~"," "))," ")))
                                    # .withColumn("replace",sf.trim(sf.regexp_replace("streams_transactions_id","~"," "))) \

# dfbaseDataMapTblInt_2.show(10,False)

dfbaseDataTransFinal = dfbaseDataTrans.select(sf.col("id").alias("mongoId"),sf.col("year"),sf.col("month"),sf.col("day"),sf.col("transactions_id").alias("tran_transactions_id"),sf.col("transactions_value").alias("tran_transactions_value"),sf.col("transactions_date").alias("tran_transactions_date")).distinct()

dfbaseDataSummaryFinal = dfbaseDataSummary.select(sf.col("id").alias("mongoId"),sf.col("year"),sf.col("month"),sf.col("day"),sf.col("summary_end_date"),sf.col("summary_num_transactions"),sf.col("summary_start_date"),sf.col("summary_total_irregular_income"),sf.col("summary_total_regular_income"))

# dfbaseDataStreamsFinal = dfbaseDataStreams.select(sf.col("id"),sf.col("year"),sf.col("month"),sf.col("day"),sf.col("applicantId"),sf.col("applicationSource") \
                                   # ,sf.col("clientID"),sf.col("createdDate"),sf.col("loanApplicationId"),sf.col("mvpApplicantId") \
                                   # ,sf.col("noHit"),sf.col("successful"),sf.col("timestamp"),sf.col("updatedAt"),sf.col("createdDatePT") \
                                   # ,sf.col("transactions_id").alias("tran_transactions_id"),sf.col("transactions_value").alias("tran_transactions_value"),sf.col("transactions_date").alias("tran_transactions_date")).distinct()

dfbaseDataMapTblInt_3 = dfbaseDataMapTblInt_2.select(sf.col("id").alias("mongoId"),sf.col("year"),sf.col("month"),sf.col("day"),sf.col("streams_id").alias("mptbl_streams_id"),sf.col("transactions_id").alias("mptbl_transactions_id"))
Example #14
#twitter
datasource_tw = glueContext.create_dynamic_frame.from_catalog(
    database="dynamodb", table_name="twitter")
dynframe_twitter = ApplyMapping.apply(
    frame=datasource_tw,
    mappings=[("symbol", "string", "symbol", "string"),
              ("full_text", "string", "full_text", "string"),
              ("created_at", "string", "created_at", "string"),
              ("id", "long", "id", "long"),
              ("url", "string", "url", "string")])

#convert aws glue dynamicframes to spark dataframes
tw = dynframe_twitter.toDF()

#remove [\\n\\t\$#]
tw = tw.withColumn("full_text",
                   f.regexp_replace(f.col("full_text"), "[\\n\\t\$#]", ""))

#convert spark dataframes back to aws glue dynamicframes
dynframe_twitter = DynamicFrame.fromDF(tw, glueContext, "nested")

#partition to 1 to get a single s3 file as output
dynframe_output = dynframe_twitter.repartition(1)

datasink = glueContext.write_dynamic_frame.from_options(
    frame=dynframe_output,
    connection_type="s3",
    connection_options={"path": "s3://541304926041-twitter"},
    format="csv")
job.commit()
                      aws_secret_access_key=secret_key,
                      region_name="us-west-2")

srcfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/JSON/" + year + "/" + month + "/" + day + ""

tgtfilePath = "s3://" + bucket + "/" + enriched_path + vendor + "/Parquet/"

dfjson = sparkSession.read.format("json").option("multiline", "true").option(
    "inferSchema", "true").load(srcfilePath)

data = dfjson.withColumn("data", explode("DATA")).select("data.*")

# dfPT = data.withColumn("createdDatePT",sf.to_timestamp(udf_TZConversion(sf.regexp_replace(data.createdDate,"T"," ").cast("string"),sf.lit("UTC"),sf.lit("US/Pacific")),"yyyy-MM-dd HH:mm:ss"))
dfPT = data.withColumn(
    "createdDatePT",
    sf.from_utc_timestamp(sf.regexp_replace(data.createdDate, "T", " "),
                          "US/Pacific"))

df = dfPT.withColumn("year",sf.split("createdDate","\-")[0]) \
          .withColumn("month",sf.split("createdDate","\-")[1]) \
          .withColumn("day",sf.split((sf.split((sf.split("createdDate","\-")[2]),"T")[0])," ")[0])

dfbaseData = df.select([col for col in df.columns])

#dfbaseData.show(10,False)

dfrankedId = dfbaseData.withColumn("row_num", sf.row_number().over(Window.partitionBy("id").orderBy(sf.asc("updatedAt")))) \
                    .where(sf.col("row_num") == 1) \
                    .select(dfbaseData["*"])

dfrankedId.repartition(sf.col("year"),sf.col("month"),sf.col("day")) \
Example #16
def __launcher_exposure():
    """ launcher页曝光 """
    sql_0 = """ select site,title,grouping_id() id_1,count(custom_uuid) playNum,round(sum(unix_timestamp(exit)-unix_timestamp(enter))/3600,2) playTime,count(distinct custom_uuid) users,round(count(custom_uuid)/count(distinct custom_uuid),2) avgPlayNum,round((sum(unix_timestamp(exit)-unix_timestamp(enter))/count(distinct custom_uuid))/60,2) avgPlayTime
from sharp.launcher_page_stay where dt="{date_0}" and  exit >= enter and unix_timestamp(exit)-unix_timestamp(enter) <= 1800 group by site,title with cube """.format(
        date_0=__str_dt_0)
    sql_1 = """ select site,title,grouping_id() id_1,count(custom_uuid) playNum,round(sum(unix_timestamp(exit)-unix_timestamp(enter))/3600,2) playTime,count(distinct custom_uuid) users,round(count(custom_uuid)/count(distinct custom_uuid),2) avgPlayNum,round((sum(unix_timestamp(exit)-unix_timestamp(enter))/count(distinct custom_uuid))/60,2) avgPlayTime
from sharp.launcher_page_stay where dt="{date_1}" and exit >= enter and unix_timestamp(exit)-unix_timestamp(enter) <= 1800 group by site,title  with cube """.format(
        date_1=__str_dt_1)
    sql_7 = """ select site,title,grouping_id() id_1,count(custom_uuid) playNum,round(sum(unix_timestamp(exit)-unix_timestamp(enter))/3600,2) playTime,count(distinct custom_uuid) users,round(count(custom_uuid)/count(distinct custom_uuid),2) avgPlayNum,round((sum(unix_timestamp(exit)-unix_timestamp(enter))/count(distinct custom_uuid))/60,2) avgPlayTime
from sharp.launcher_page_stay where dt="{date_7}" and exit >= enter and unix_timestamp(exit)-unix_timestamp(enter) <= 1800 group by site,title  with cube """.format(
        date_7=__str_dt_7)
    spark.sql("show databases")
    spark.sql("use sharp")
    df_cube_0 = spark.sql(sql_0)
    df_cube_1 = spark.sql(sql_1)
    df_cube_7 = spark.sql(sql_7)
    # join conditions
    left_conditions_0_1 = (F.coalesce(
        F.col("t_0.site"), F.lit("123")) == F.coalesce(
            F.col("t_1.site"), F.lit("123"))) & (F.coalesce(
                F.col("t_0.title"), F.lit("123")) == F.coalesce(
                    F.col("t_1.title"), F.lit("123"))) & (F.col("t_0.id_1")
                                                          == F.col("t_1.id_1"))
    left_conditions_0_7 = (F.coalesce(
        F.col("t_0.site"), F.lit("123")) == F.coalesce(
            F.col("t_7.site"), F.lit("123"))) & (F.coalesce(
                F.col("t_0.title"), F.lit("123")) == F.coalesce(
                    F.col("t_7.title"), F.lit("123"))) & (F.col("t_0.id_1")
                                                          == F.col("t_7.id_1"))
    # final report
    report = df_cube_0.alias("t_0").join(
        df_cube_1.alias("t_1"), left_conditions_0_1, "left_outer"
    ).join(df_cube_7.alias("t_7"), left_conditions_0_7, "left_outer").select(
        F.regexp_replace(F.lit(__str_dt_0), "-", "").cast("int").alias("date"),
        F.col("t_0.site").alias("channelName"),
        F.col("t_0.title").alias("typeName"),
        F.col("t_0.id_1").alias("id_1"),
        F.col("t_0.playNum").alias("totalPlayNum"),
        F.concat(
            F.round((F.col("t_0.playNum") / F.col("t_1.playNum") - 1) * 100,
                    2), F.lit("%")).alias("playNumCompareDay"),
        F.concat(
            F.round((F.col("t_0.playNum") / F.col("t_7.playNum") - 1) * 100,
                    2), F.lit("%")).alias("playNumCompareWeek"),
        F.col("t_0.playTime").alias("totalPlayTime"),
        F.concat(
            F.round((F.col("t_0.playTime") / F.col("t_1.playTime") - 1) * 100,
                    2), F.lit("%")).alias("playTimeCompareDay"),
        F.concat(
            F.round((F.col("t_0.playTime") / F.col("t_7.playTime") - 1) * 100,
                    2), F.lit("%")).alias("playTimeCompareWeek"),
        F.col("t_0.users").alias("totalUserCount"),
        F.concat(
            F.round((F.col("t_0.users") / F.col("t_1.users") - 1) * 100, 2),
            F.lit("%")).alias("userCountCompareDay"),
        F.concat(
            F.round((F.col("t_0.users") / F.col("t_7.users") - 1) * 100, 2),
            F.lit("%")).alias("userCountCompareWeek"),
        F.col("t_0.avgPlayNum").alias("averagePlayNum"),
        F.concat(
            F.round(
                (F.col("t_0.avgPlayNum") / F.col("t_1.avgPlayNum") - 1) * 100,
                2), F.lit("%")).alias("avgPlayNumCompareDay"),
        F.concat(
            F.round(
                (F.col("t_0.avgPlayNum") / F.col("t_7.avgPlayNum") - 1) * 100,
                2), F.lit("%")).alias("avgPlayNumCompareWeek"),
        F.col("t_0.avgPlayTime").alias("averagePlayTime"),
        F.concat(
            F.round((F.col("t_0.avgPlayTime") / F.col("t_1.avgPlayTime") - 1) *
                    100, 2), F.lit("%")).alias("avgPlayTimeCompareDay"),
        F.concat(
            F.round((F.col("t_0.avgPlayTime") / F.col("t_7.avgPlayTime") - 1) *
                    100, 2), F.lit("%")).alias("avgPlayTimeCompareWeek"))
    return report
Example #17
from __future__ import print_function

import sys

from pyspark import SparkContext
from pyspark.sql import SQLContext
from pyspark.sql.functions import lit, concat, concat_ws, regexp_replace

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print("Usage: trove-load.py <input json> <output parquet>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="Trove Load")
    sqlContext = SQLContext(sc)

    raw = sqlContext.read.json(sys.argv[1])
    df = raw.na.drop(subset=['id', 'fulltext']).dropDuplicates(['id'])
    df.select(concat(lit('trove/'), df.id).alias('id'),
              concat_ws('/', lit('trove'), df.titleId, df.date).alias('issue'),
              concat(lit('trove/'), df.titleId).alias('series'),
              df.date, df.firstPageId, df.firstPageSeq.cast('int').alias('seq'),
              df.heading.alias('title'), df.category,
              regexp_replace(regexp_replace(df.fulltext, '&', '&amp;'),
                             '<', '&lt;').alias('text'))\
      .write.save(sys.argv[2])

    sc.stop()
    res['name'] = book
    res['text'] = "\n".join(['<div class="page-break" page="%d">%s</div>' % (r.seq, r.text) for r in pp]) + ('<archiveid tokenizetagcontent="false">%s</archiveid>' % book)
    return Row(**res)

if __name__ == "__main__":
    if len(sys.argv) < 3:
        print("Usage: pretty-cluster.py <input> <page-out> <book-out>", file=sys.stderr)
        exit(-1)
    sc = SparkContext(appName="Proteus Pages")
    sqlContext = SQLContext(sc)

    raw = sqlContext.read.load(sys.argv[1])
    cols = set(raw.columns)
    idcols = [col(x) for x in ['identifier', 'issue', 'book'] if x in cols]

    df = raw.withColumn('identifier', regexp_replace(coalesce(*idcols), '[^A-Za-z0-9]+', ''))

    counts = df.groupBy('identifier').count().select(col('identifier'), col('count').alias('imagecount'))

    appendID = udf(lambda book, text: '%s <archiveid tokenizetagcontent="false">%s</archiveid>' % (text, book))

    renamed = df.join(counts, 'identifier')\
                .drop('regions')\
                .withColumn('pageNumber', col('seq'))\
                .withColumn('name', concat_ws('_', col('identifier'), col('seq')))\
                .withColumn('text', regexp_replace(col('text'), '\\n', '<br>\\\n'))

    renamed.withColumn('text', appendID(col('identifier'), col('text')))\
           .write.format('json').save(sys.argv[2])

    renamed.rdd.groupBy(lambda r: r.identifier).map(pageCat).toDF()\
Example #19
import sys

from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.functions import format_string

spark = SparkSession.builder.appName("my_pp").getOrCreate()

joined_df = spark.read.format('csv').options(
    header='false', inferschema='true').load(sys.argv[1]).select(
        F.col('_c0').alias('medallion'),
        F.col('_c3').alias('pickup_datetime'))

medallion_stats = joined_df.withColumn(
    "pickup_datetime", F.date_format(F.col("pickup_datetime"), "yyyy-MM-dd"))

medallion_stats = medallion_stats.groupBy(F.col('medallion')).agg(
    F.count('*').alias('total_trips'),
    F.countDistinct(F.col('pickup_datetime')).alias('days_driven'))

medallion_stats = medallion_stats.select(
    'medallion', 'total_trips', 'days_driven',
    F.regexp_replace(
        F.format_number(
            F.round(F.col('total_trips') / F.col('days_driven'), 2), 2), ',',
        '').alias('average')).sort('medallion')

medallion_stats.select(
    format_string('%s,%s,%s,%s', medallion_stats.medallion,
                  medallion_stats.total_trips, medallion_stats.days_driven,
                  medallion_stats.average)).write.save('task2d-sql.out',
                                                       format="text")

spark.stop()
SparkContext.setSystemProperty("hive.metastore.uris",
                               "http://192.168.58.24:8888")
spark.conf.set("spark.sql.crossJoin.enabled", "true")

master = spark.sql('SELECT * FROM dwhdb.master_matching')
#master.limit(10).toPandas()

delta = spark.sql('SELECT * FROM dwhdb.delta_matching')
#delta.limit(10).toPandas()

master = (master.withColumn(
    "clean_id",
    F.regexp_replace(
        F.trim(
            F.lower(F.regexp_replace('nomor_identitas', "[^a-zA-Z0-9\\s]",
                                     ""))), " +",
        " ")).withColumn(
            "clean_nama",
            F.regexp_replace(
                F.trim(
                    F.lower(
                        F.regexp_replace('nama_sesuai_identitas',
                                         "[^a-zA-Z0-9\\s]", ""))), " +",
                " ")).withColumn(
                    "clean_tgl_lahir",
                    F.regexp_replace(
                        F.trim(
                            F.lower(
                                F.regexp_replace('tanggal_lahir',
                                                 "[^a-zA-Z0-9\\s]", ""))),
    def classify_data(self, file_name):
        """Profiles columns from input file."""
        try:
            session_id = datetime.datetime.now(timezone).strftime("%Y-%m-%d %H:%M:%S")
            print("\n\n=====Python Job starts at {session_id}\n".format(session_id=session_id))

            save = HyperProfiling.get_url('/dataclass/out/save')
            update = HyperProfiling.get_url('/dataclass/que/update')

            if save.split(':')[0] != 'https' or update.split(':')[0] != 'https':
                raise HttpProtocolException(save, update, save.split(':')[0], update.split(':')[0])

            session = HyperProfiling().get_session()

            hdfs_output_dir = '/{env}/EDW/DSODB/OPS/CHEVELLE/HYPER_PROF_RSLT/Output'.format(env=self._env)
            input_path = '/{env}/EDW/DSODB/OPS/CHEVELLE/HYPER_PROF_RSLT/Input/{file_name}'

            base_query = \
                """
                select 
                cast('{COL_ID}' as string) as col_id,
                cast('{DATA_CLS_NM}' as string) as data_cls_nm,
                cast('{CRT_TS}' as string) as crt_ts,
                cast('{PROF_START_TS}' as string) as prof_start_ts,
                cast('{PROF_END_TS}' as string) as prof_end_ts,
                cast('{TABLE_ID}' as string) as table_id,
                cast('{BATCH_ID}' as int) as batch_id,
                cast('{TOTAL_ROW_COUNT}' as int) as tot_row_cnt,
                cast('{SAMPLE_ROW_COUNT}' as int) as sample_row_cnt,
                cast(count(distinct({column})) as int) as col_val_uniq_cnt, 
                cast(sum(case when {column} REGEXP '{REGEX_STR}' then 1 else 0 end) as int) as col_val_data_cls_cnt,
                cast(max({column}) as string) as col_max_val,
                cast(min({column}) as string) as col_min_val, 
                cast(avg(length({column})) as int) as col_avg_len,
                cast('{REGEX_STR}' as string) as appl_regex_str,
                cast('{CRT_BY}' as string) as crt_by 
                from {temp_table}
                """

            file_schema = StructType([
                    StructField("hive_schema", StringType()),
                    StructField("table_name", StringType()),
                    StructField("col_name", StringType()),
                    StructField("data_cls_nm", StringType()),
                    StructField("regex_str", StringType()),
                    StructField("table_id", StringType()),
                    StructField("col_id", StringType()),
                    StructField("batch_id", StringType()),
                ])

            empty_schema = StructType([
                StructField("col_id", StringType()),
                StructField("data_cls_nm", StringType()),
                StructField("crt_ts", StringType()),
                StructField("prof_start_ts", StringType()),
                StructField("prof_end_ts", StringType()),
                StructField("table_id", StringType()),
                StructField("batch_id", StringType()),
                StructField("tot_row_cnt", StringType()),
                StructField("sample_row_cnt", StringType()),
                StructField("col_val_uniq_cnt", StringType()),
                StructField("col_val_data_cls_cnt", StringType()),
                StructField("col_max_val", StringType()),
                StructField("col_min_val", StringType()),
                StructField("col_avg_len", StringType()),
                StructField("appl_regex_str", StringType()),
                StructField("crt_by", StringType())
            ])

            file_ = input_path.format(env=self._env, file_name=file_name)
            print(file_)

            schema_check = self._spark.read.csv(file_, sep='\x1c')
            schema_length = len(schema_check.columns)

            file_extension = file_name.split('.')[1]
            if file_extension not in ('txt', 'csv', 'dat'):
                raise InvalidFileExtension(file_name, file_extension)
            elif schema_length < 8:
                raise InvalidSchemaException(file_name, schema_length)
            else:
                print("File is ok to process")

            input_file_base = self._spark.read.csv(file_, sep='\x1c', schema=file_schema)
            input_file = input_file_base.withColumn('hive_schema', regexp_replace('hive_schema', "\-", "_"))
            input_restructured = input_file. \
                select('table_id',
                       'col_id',
                       'batch_id',
                       'data_cls_nm',
                       'col_name',
                       'regex_str',
                       F.concat_ws('.', 'hive_schema', 'table_name').alias('table')). \
                orderBy('table', ascending=False)
            input_tables = input_restructured.select(input_restructured['table']).dropDuplicates()
            input_tables_broadcast = self._spark.sparkContext.broadcast(input_tables.rdd.collect())

            application_start = time.time()

            for tables in input_tables_broadcast.value:

                rows_to_profile = \
                    input_restructured.select('table_id',
                                              'col_id',
                                              'batch_id',
                                              'col_name',
                                              'data_cls_nm',
                                              'regex_str'). \
                        where(input_restructured['table'] == '{table}'.format(table=tables.table)).toLocalIterator()

                for row in list(rows_to_profile):
                    col_id = row.col_id
                    table_id = row.table_id
                    batch_id = int(row.batch_id)
                    data_cls_nm = row.data_cls_nm
                    col_name = row.col_name
                    regex_str = row.regex_str. \
                        replace(chr(169), chr(92)). \
                        replace(chr(171), chr(123)). \
                        replace(chr(187), chr(125))

                    empty_ts = datetime.datetime.now(timezone).strftime("%Y-%m-%d %H:%M:%S")

                    failed_update_schema = \
                        {
                            'col_id': col_id,
                            'data_cls_nm': data_cls_nm,
                            'batch_id': batch_id,
                            'task_stat_cd': 3,
                            'fail_cnt': 0
                        }

                    success_update_schema = \
                        {
                            'col_id': col_id,
                            'data_cls_nm': data_cls_nm,
                            'batch_id': batch_id,
                            'task_stat_cd': 2,
                            'fail_cnt': 0
                        }

                    empty_update_schema = \
                        {
                            'col_id': col_id,
                            'data_cls_nm': data_cls_nm,
                            'batch_id': batch_id,
                            'task_stat_cd': 4,
                            'fail_cnt': 0
                        }

                    empty_record = (
                        col_id,
                        data_cls_nm,
                        empty_ts,
                        empty_ts,
                        empty_ts,
                        table_id,
                        batch_id,
                        "",
                        "",
                        "",
                        "",
                        "",
                        "",
                        "",
                        regex_str.replace(chr(92), chr(169)).replace(chr(123), chr(171)).replace(chr(125), chr(187)),
                        "chevelle"
                    )

                    try:
                        table_data = self._spark.table(tables.table)
                        data_empty = table_data.rdd.isEmpty()
                    except:

                        try:
                            update_output = UpdateQueue({'update': update, 'session': session})
                            update_response = update_output.save_results({'update_output': empty_update_schema})
                            empty_df = self._spark.createDataFrame([empty_record], empty_schema)
                            empty_df.write.json(hdfs_output_dir, mode='ignore')
                            payload = json.loads(empty_df.toJSON().collect()[0], object_pairs_hook=OrderedDict)
                            output = SaveOutput({'save': save, 'session': session})
                            output_response = output.save_results({'payload': payload})
                            if update_response['status'] != 200:
                                summary_msg = '{output}\tCRITICAL FAIL\tCheck Update Endpoint'.format(
                                    output=json.dumps(failed_update_schema))
                                print(summary_msg)
                                sys.exit(0)
                            elif output_response['status'] != 200:
                                raise Exception
                        except Exception:
                            update_output = UpdateQueue({'update': update, 'session': session})
                            failed_update_response = update_output.save_results({'update_output': failed_update_schema})
                            exc_type, exc_value, exc_tb = sys.exc_info()
                            print(traceback.format_exception(exc_type, exc_value, exc_tb))
                            if failed_update_response['status'] != 200:
                                summary_msg = '{output}\tCRITICAL FAIL\tCheck Update Endpoint'.format(
                                    output=json.dumps(failed_update_schema))
                                print(summary_msg)
                                sys.exit(0)
                            else:
                                failed_log = {'content': json.dumps(failed_update_schema),'status': failed_update_response['status']}
                                r = requests.post(
                                    'https://chevelle-elk-logger.cp-epg2i.domain.com/logging/chevelle_dc/stdout',
                                    json=failed_log)
                                r.close()
                                continue

                        update_log = {'content': json.dumps(empty_update_schema), 'status':update_response['status']}
                        r = requests.post('https://chevelle-elk-logger.cp-epg2i.domain.com/logging/chevelle_dc/stdout',
                                          json=update_log)
                        r.close()
                        continue

                    else:

                        if not data_empty:

                            try:
                                table_data_column = table_data.select(col_name). \
                                        where(F.length(col_name) > 0). \
                                        where(lower(trim(table_data[col_name])).
                                              isin(' ', 'null', 'n/a', 'unknown', 'unk', 'unspecified', 'no match row id',
                                                   '__not_applicable__') == False)
                            except Exception:

                                try:
                                    update_output = UpdateQueue({'update': update, 'session': session})
                                    update_response = update_output.save_results({'update_output': empty_update_schema})
                                    empty_df = self._spark.createDataFrame([empty_record], empty_schema)
                                    empty_df.write.json(hdfs_output_dir, mode='ignore')
                                    payload = json.loads(empty_df.toJSON().collect()[0], object_pairs_hook=OrderedDict)
                                    output = SaveOutput({'save': save, 'session': session})
                                    output_response = output.save_results({'payload': payload})
                                    if output_response['status'] != 200:
                                        raise Exception
                                except Exception:
                                    update_output = UpdateQueue({'update': update, 'session': session})
                                    update_response = update_output.save_results({'update_output': failed_update_schema})
                                    exc_type, exc_value, exc_tb = sys.exc_info()
                                    print(traceback.format_exception(exc_type, exc_value, exc_tb))
                                    if update_response['status'] != 200:
                                        summary_msg = '{output}\tCRITICAL FAIL\tCheck Update Endpoint'.format(
                                            output=json.dumps(failed_update_schema))
                                        print(summary_msg)
                                        sys.exit(0)
                                    else:
                                        print(failed_update_schema, update_response['status'])
                                        continue

                                print(empty_update_schema, update_response['status'])
                                continue

                            else:

                                column_row_count_full = table_data_column.count()
                                output_strings = \
                                    {
                                        'COL_ID': col_id,
                                        'DATA_CLS_NM': data_cls_nm,
                                        'PROF_START_TS': 'START',
                                        'PROF_END_TS': 'END',
                                        'BATCH_ID': batch_id,
                                        'TABLE_ID': table_id,
                                        'TOTAL_ROW_COUNT': column_row_count_full,
                                        'SAMPLE_ROW_COUNT': column_row_count_full,
                                        'CRT_BY': 'chevelle',
                                        'CRT_TS': 'CRT',
                                        'REGEX_STR': regex_str,
                                        'column': col_name,
                                        'temp_table': 'temp_'
                                    }

                                if column_row_count_full >= 1000000:

                                    column_sample = table_data_column.sample(False, 0.1)
                                    column_sample.createOrReplaceTempView('temp_sample')
                                    column_sample_count = column_sample.count()

                                    output_strings['SAMPLE_ROW_COUNT'] = column_sample_count
                                    output_strings['temp_table'] = 'temp_sample'

                                    try:
                                        profile_start = datetime.datetime.now(timezone).strftime("%Y-%m-%d %H:%M:%S")

                                        output_strings['PROF_START_TS'] = profile_start
                                        results = self._spark.sql(base_query.format(**output_strings))

                                        profile_end = datetime.datetime.now(timezone).strftime("%Y-%m-%d %H:%M:%S")

                                        final_results = \
                                            results.\
                                                replace('END', profile_end, 'PROF_END_TS').\
                                                replace('CRT', profile_end, 'CRT_TS')
                                        update_output = UpdateQueue({'update': update, 'session': session})
                                        update_response = update_output.save_results({'update_output': success_update_schema})
                                        final_results.write.json(hdfs_output_dir, mode='ignore')
                                        payload = json.loads(final_results.toJSON().collect()[0], object_pairs_hook=OrderedDict)
                                        payload['appl_regex_str'] =  \
                                            payload['appl_regex_str'].\
                                                replace(chr(92), chr(169)).\
                                                replace(chr(123), chr(171)).\
                                                replace(chr(125), chr(187))

                                        output = SaveOutput({'save': save, 'session': session})
                                        output_response = output.save_results({'payload': payload})
                                        if output_response['status'] != 200:
                                            raise Exception
                                    except Exception:
                                        update_output = UpdateQueue({'update': update, 'session': session})
                                        update_response = update_output.save_results({'update_output': failed_update_schema})
                                        exc_type, exc_value, exc_tb = sys.exc_info()
                                        print(traceback.format_exception(exc_type, exc_value, exc_tb))
                                        if update_response['status'] != 200:
                                            summary_msg = '{output}\tCRITICAL FAIL\tCheck Update Endpoint'.format(
                                                output=json.dumps(failed_update_schema))
                                            print(summary_msg)
                                            sys.exit(0)
                                        else:
                                            print(failed_update_schema, update_response['status'])
                                            continue
                                    print(success_update_schema, update_response['status'])

                                elif 0 < column_row_count_full < 1000000:

                                    table_data_column.createOrReplaceTempView('temp_')

                                    try:
                                        profile_start = datetime.datetime.now(timezone).strftime("%Y-%m-%d %H:%M:%S")
                                        output_strings['PROF_START_TS'] = profile_start
                                        results = self._spark.sql(base_query.format(**output_strings))
                                        profile_end = datetime.datetime.now(timezone).strftime("%Y-%m-%d %H:%M:%S")
                                        final_results = \
                                            results.\
                                                replace('END', profile_end, 'PROF_END_TS').\
                                                replace('CRT', profile_end, 'CRT_TS')
                                        update_output = UpdateQueue({'update': update, 'session': session})
                                        update_response = update_output.save_results({'update_output': success_update_schema})
                                        final_results.write.json(hdfs_output_dir, mode='ignore')
                                        payload = json.loads(final_results.toJSON().collect()[0], object_pairs_hook=OrderedDict)
                                        payload['appl_regex_str'] = \
                                            payload['appl_regex_str']. \
                                                replace(chr(92), chr(169)). \
                                                replace(chr(123), chr(171)). \
                                                replace(chr(125), chr(187))
                                        output = SaveOutput({'save': save, 'session': session})
                                        output_response = output.save_results({'payload': payload})
                                        if output_response['status'] != 200:
                                            raise Exception
                                    except Exception:

                                        update_output = UpdateQueue({'update': update, 'session': session})
                                        update_response = update_output.save_results({'update_output': failed_update_schema})
                                        exc_type, exc_value, exc_tb = sys.exc_info()
                                        print(traceback.format_exception(exc_type, exc_value, exc_tb))
                                        if update_response['status'] != 200:
                                            summary_msg = '{output}\tCRITICAL FAIL\tCheck Update Endpoint'.format(
                                                output=json.dumps(failed_update_schema))
                                            print(summary_msg)
                                            sys.exit(0)
                                        else:
                                            print(failed_update_schema, update_response['status'])
                                            continue
                                    print(success_update_schema, update_response['status'])

                        else:

                            try:
                                update_output = UpdateQueue({'update': update, 'session': session})
                                update_response = update_output.save_results({'update_output': empty_update_schema})
                                empty_df = self._spark.createDataFrame([empty_record], empty_schema)
                                empty_df.write.json(hdfs_output_dir, mode='ignore')
                                payload = json.loads(empty_df.toJSON().collect()[0], object_pairs_hook=OrderedDict)
                                output = SaveOutput({'save': save, 'session': session})
                                output_response = output.save_results({'payload': payload})
                                if output_response['status'] != 200:
                                    raise Exception
                            except Exception:
                                update_output = UpdateQueue({'update': update, 'session': session})
                                update_response = update_output.save_results({'update_output': failed_update_schema})
                                exc_type, exc_value, exc_tb = sys.exc_info()
                                print(traceback.format_exception(exc_type, exc_value, exc_tb))
                                if update_response['status'] != 200:
                                    summary_msg = '{output}\tCRITICAL FAIL\tCheck Update Endpoint'.format(
                                        output=json.dumps(failed_update_schema))
                                    print(summary_msg)
                                    sys.exit(0)
                                else:
                                    print(failed_update_schema, update_response['status'])
                                    continue

                            print(empty_update_schema, update_response['status'])
                            continue

        except:
            sys.exit(0)

        application_end_time = time.time()
        print('-- Application Run time --')
        print(str(application_end_time - application_start))
        sys.exit(0)
Example #22
# This file is called word_counter.py
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

df = spark.read.text(paths='/job/samples/word_counter.py')
# Replace code chars with spaces
df = df.withColumn('value', F.regexp_replace('value', '\W', ' '))
# Split on spaces
df = df.select(F.explode(F.split('value', ' ')).alias('word'))
# Filter min length
df = df.where(F.length('word') > 0)
# Group by occurrence and order
agg = (df.groupBy('word').count().sort('count', ascending=False))
# Print top result
agg.limit(5).show()
df_new = df_new.fillna({'Time': 3})
#Removing rows with NA in Violation Location and Violation County
df_new = df_new.dropna(how='any',
                       subset=['Violation Location', 'Violation County'])
#Fill na of these columns using  respective max values
# cols = ['Vehicle Body Type','Vehicle Make','Violation County','Violation In Front Of Or Opposite']
# agg_expr = [mode(f.collect_list(col)).alias(col) for col in cols]
# max_vals = df_new.agg(*agg_expr).collect()[0]
# df_new = df_new.fillna({'Vehicle Body Type':max_vals['Vehicle Body Type'],'Vehicle Make':max_vals['Vehicle Make'],'Violation County':max_vals['Violation County'],'Violation In Front Of Or Opposite':max_vals['Violation In Front Of Or Opposite']})
df_new = df_new.dropna(how='any')
#Renaming columns
names = df_new.schema.names
for name in names:
    df_new = df_new.withColumnRenamed(name, name.replace(" ", "_"))
#Mapping violation location
df_new = df_new.withColumn('Violation_Location', regexp_replace('Violation_Location', 'KINGS', 'K'))\
.withColumn('Violation_Location', regexp_replace('Violation_Location', 'KING', 'K'))\
.withColumn('Violation_Location', regexp_replace('Violation_Location', 'QUEEN', 'Q'))\
.withColumn('Violation_Location', regexp_replace('Violation_Location', 'QU', 'Q'))\
.withColumn('Violation_Location', regexp_replace('Violation_Location', 'NEWY', 'NY'))\
.withColumn('Violation_Location', regexp_replace('Violation_Location', 'NEW Y', 'NY'))\
.withColumn('Violation_Location', regexp_replace('Violation_Location', 'MAN', 'NY'))\
.withColumn('Violation_Location', regexp_replace('Violation_Location', 'MH', 'NY'))\
.withColumn('Violation_Location', regexp_replace('Violation_Location', 'BRONX', 'BX'))

############################################################################## TRAINING PIPELINE ##################################################################################
#Label encoding pipeline
#Split the data
train, test = df_new.randomSplit([0.93, 0.07])
indexers = [
    StringIndexer(inputCol=column,
Example #24
#Turning Text into Tables
""" 
- remove punctuation and numbers
- tokenize (split into individual words)
- remove stop words
- apply the hashing trick
- convert to TF-IDF representation.
"""

# Import the necessary functions
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Remove punctuation (REGEX provided) and numbers
wrangled = sms.withColumn('text',
                          regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
wrangled = wrangled.withColumn('text',
                               regexp_replace(wrangled.text, '[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +',
                                                      ' '))

# Split the text into words
wrangled = Tokenizer(inputCol='text', outputCol='words').transform(wrangled)

wrangled.show(4, truncate=False)

#########################################################################################

#Stop words and hashing
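
# The section header above and the docstring's remaining steps (stop words, hashing
# trick, TF-IDF) have no code in this excerpt. Below is a minimal sketch with
# pyspark.ml; the output column names and the numFeatures value are assumptions.
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
    .transform(wrangled)
# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
    .transform(wrangled)
# Convert hashed term frequencies to TF-IDF
tf_idf = IDF(inputCol='hash', outputCol='features')\
    .fit(wrangled).transform(wrangled)

tf_idf.select('terms', 'features').show(4, truncate=False)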
Example #25
                         F.concat(
                             F.lit('<dccon> '),
                             F.regexp_extract(col, r'src="[^?]*\?no=([^"]+)"',
                                              1), F.lit(' '),
                             F.regexp_extract(col, r'title="([^"]*)"',
                                              1))).otherwise(F.col(col)))


d2c_df = df.selectExpr('gallery_id', 'title', 'author',
                       'EXPLODE(comments) as comment')
d2c_df = d2c_df.selectExpr('gallery_id', 'title', 'author', 'comment.contents as comment', 'comment.author as comm_author')\
    .filter(F.col('author') != F.col('comm_author'))\
    .select('gallery_id', 'title', 'comment')
d2c_df = d2c_df.filter((~F.col('comment').startswith('<div'))
                       & (~F.col('comment').isNull()))
d2c_df = d2c_df.withColumn('title', F.regexp_replace('title', r'[\s\n\t]+',
                                                     ' '))
d2c_df = d2c_df.withColumn('comment',
                           F.regexp_replace('comment', r'[\s\n\t]+', ' '))
d2c_df = dccon_parse(d2c_df, 'comment')
d2c_df = d2c_df.selectExpr('''CONCAT("text:", gallery_id, "¶", title, '\t',
                              'labels:', comment, '\t', "episode:done") AS episode '''
                           )
d2c_df = d2c_df.distinct()


c2c_df = df.selectExpr(
    'gallery_id', 'id as document_id', 'author as document_author',
    'EXPLODE(comments) AS comment')\
    .select('gallery_id', 'document_id', 'document_author', 'comment.author', 'comment.contents',
            'comment.created_at', F.coalesce('comment.parent_id', 'comment.id').alias('root_id'))
c2c_df = c2c_df.filter((~F.col('contents').startswith('<div'))
Example #26
df = dfparquet.withColumn("year",sf.split("createdDate","\-")[0]) \
          .withColumn("month",sf.split("createdDate","\-")[1]) \
          .withColumn("day",sf.split((sf.split((sf.split("createdDate","\-")[2]),"T")[0])," ")[0])

dfbaseData = df.select([col for col in df.columns])

dfjoin = dfbaseData.join(dfrmi,(dfbaseData.loanApplicationId == dfrmi.loan_application_id) & \
                                    (sf.unix_timestamp(dfbaseData.createdDate) - sf.unix_timestamp(dfrmi.date_created) >= 0),'left_outer') \
                    .join(dfrmsNS,(dfrmi.id == dfrmsNS.input_id),'left_outer') \
                    .select(dfbaseData.id \
                    ,dfbaseData.applicantId \
                    ,dfbaseData.applicationSource \
                    ,dfbaseData.mvpApplicantId \
                    ,dfbaseData.loanApplicationId \
                    # ,dfbaseData.mvpLoanApplicationId \
                    ,sf.regexp_replace(dfbaseData.createdDate,"T"," ").cast(TimestampType()).alias("telesignTimestampUTC") \
                    ,dfbaseData.createdDatePT.alias("telesignTimestampPT") \
                    ,dfrmi.id.alias("rmiId") \
                    ,dfrmi.date_created.alias("riskTimestampPT") \
                    ,dfrmsNS.score_type.alias("scoreType") \
                    ,dfbaseData.year \
                    ,dfbaseData.month \
                    ,dfbaseData.day)

dfrankedId =  dfjoin.withColumn("row_num", sf.row_number().over(Window.partitionBy("loanApplicationId","telesignTimestampPT").orderBy(sf.desc("riskTimestampPT"),sf.desc("scoreType")))) \
                    .where(sf.col("row_num") == 1) \
                    .select(dfjoin["*"])

dfsplitColDevice = dfbaseData.select(sf.col("id").alias("id")
                                    ,sf.col("device_info_imei").alias("imei")
                                    ,sf.col("device_info_make").alias("make")
Example #27
taxiTripsRaw = sparkSession.read.csv(path=filePath,
                                     header=True,
                                     schema=schemaTaxiTrips,
                                     timestampFormat="MM/dd/yyyy hh:mm:ss a",
                                     mode="DROPMALFORMED")
# Data cleaning
taxiTrips = taxiTripsRaw.select(
    "trip_id", "taxi_id", "trip_start_timestamp", "trip_end_timestamp",
    taxiTripsRaw["trip_seconds"].astype('integer').alias("trip_seconds"),
    taxiTripsRaw["trip_miles"].astype('integer').alias("trip_miles"),
    "pickup_census_tract", "dropoff_census_tract",
    taxiTripsRaw["pickup_community_area"].astype('integer').alias(
        "pickup_community_area"),
    taxiTripsRaw["dropoff_community_area"].astype('integer').alias(
        "dropoff_community_area"),
    F.regexp_replace(taxiTripsRaw["fare"], '[\$,)]',
                     '').astype('double').alias("fare"),
    F.regexp_replace(taxiTripsRaw["tips"], '[\$,)]',
                     '').astype('double').alias("tips"),
    F.regexp_replace(taxiTripsRaw["tolls"], '[\$,)]',
                     '').astype('double').alias("tolls"),
    F.regexp_replace(taxiTripsRaw["extras"], '[\$,)]',
                     '').astype('double').alias("extras"),
    F.regexp_replace(taxiTripsRaw["trip_total"], '[\$,)]',
                     '').astype('double').alias("trip_total"), "payment_type",
    "company", "pickup_centroid_latitude", "pickup_centroid_longitude",
    "pickup_centroid_location", "dropoff_centroid_latitude",
    "dropoff_centroid_longitude", "dropoff_centroid_location",
    F.year(taxiTripsRaw["trip_start_timestamp"]).alias("year"),
    F.month(taxiTripsRaw["trip_start_timestamp"]).alias("month"))

# Write the transformed data to S3 (AWS) or HDFS (local), partitioned by year and month
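# A minimal sketch of that write step; outputPath is a placeholder destination (assumption):
outputPath = "s3://some-bucket/taxi_trips/"
taxiTrips.write \
    .mode("overwrite") \
    .partitionBy("year", "month") \
    .parquet(outputPath)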
import pyspark.sql.functions as f
from matplotlib import pyplot as plt
from pyspark.sql.types import StructType, StringType, DoubleType, IntegerType, BooleanType, StructField, LongType, DateType,TimestampType, FloatType
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel, LogisticRegression, LogisticRegressionModel
from pyspark.ml.evaluation import MulticlassClassificationEvaluator, BinaryClassificationEvaluator
from pyspark.ml.linalg import Vectors, VectorUDT
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoder
from pyspark.ml import Pipeline

# COMMAND ----------

bids_train = spark.read.parquet("s3://rtl-databricks-datascience/lpater/processed_data/bids_train.parquet/")

# COMMAND ----------

bids_train = bids_train.withColumn('new_deal_id', f.regexp_replace('deal_id',"\.|18ff3",""))\
.drop('deal_id')\
.withColumnRenamed('new_deal_id','deal_id')

bids_train.cache()

# COMMAND ----------

deal_ids = bids_train.select("deal_id").distinct()

deal_ids_list = list(deal_ids.select("deal_id").toPandas()["deal_id"])

# COMMAND ----------

for deal_id in deal_ids_list:
  temp_bids = bids_train.filter(bids_train.deal_id==deal_id).select("max_bid")
import sys
import sqlite3

import pandas
import pyspark.sql.functions as F
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode

spark = SparkSession.builder.appName('move_ru parse').getOrCreate()
filePath = sys.argv[1] + '/result_01.csv'
conn = sqlite3.connect('movedatabase.db')

# load the data from the file
df = spark.read.csv(filePath, inferSchema=True, header=False)

# clean up the loaded data
split_column = F.split(df['_c2'], ' ')
df = df.withColumnRenamed('_c0', 'flat_id') \
    .withColumn('city_type', F.regexp_replace(split_column.getItem(2), r'[^а-я]', '')) \
    .withColumn('city', F.split(df['_c2'], '/').getItem(1)) \
    .withColumn('rooms', F.split(df['_c5'], ':').getItem(2)) \
    .withColumn('m2', F.split(df['_c5'], ' ').getItem(2)) \
    .withColumn('price', F.regexp_replace(F.col('_c1'), r'\D', '')) \
    .withColumn('price_m', F.round((F.col('price') / F.col('m2')), 0)) \
    .withColumn('m2_room', F.regexp_replace(F.split(df['_c5'], ':').getItem(1), r'м2 Комнат', '')) \
    .withColumn('floor', F.split(df['_c5'], ' ').getItem(0)) \
    .withColumn('region', split_column.getItem(0)) \
    .withColumn('highway', F.split(df['_c3'], ' ').getItem(0)) \
    .withColumn('mkad_km', F.split(df['_c4'], ' ').getItem(0)) \
    .withColumnRenamed('_c6', 'update_date') \
    .drop('_c1', '_c2', '_c3', '_c4', '_c5')
# .write.csv(sys.argv[1]+'/result_clean', header = True)

# write the resulting data to the database
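# A minimal sketch of that write, going through pandas; the table name 'flats' is a placeholder:
df.toPandas().to_sql('flats', conn, if_exists='replace', index=False)
conn.close()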
Example #30
          .withColumn("day",sf.split((sf.split((sf.split("createdDate","\-")[2]),"T")[0])," ")[0])

for colm in df.columns:
    if colm.startswith("resultcode_"):
        df = df.withColumn(colm, sf.concat(sf.lit(colm + "^"), sf.col(colm)))

df_resultcode = df.withColumn(
    "resultcodeArray",
    sf.array([col for col in df.columns if col.startswith("resultcode_")]))

dfexplode = df_resultcode.select(sf.col("id"),sf.col("mvpLoanApplicationId"),sf.col("loanApplicationId"), \
             sf.col("clientID"),sf.col("mvpClientID"),sf.col("createdDatePT"),sf.col("year"),sf.col("month"),sf.col("day") \
            ,sf.explode_outer("resultcodeArray").alias("resultcodes") \
            ).where(sf.col("resultcodes").isNotNull())

dfsplitCol = dfexplode.withColumn("result_key",sf.regexp_replace(sf.split("resultcodes","\^")[0],"_",".")) \
                      .withColumn("result_message",sf.split("resultcodes","\^")[1]) \
                      .drop("resultcodes")

dfbaseData = df.select(
    [col for col in df.columns if not col.startswith("resultcode_")])

dfjoin = dfbaseData.join(dfrmi,(dfbaseData.loanApplicationId == dfrmi.loan_application_id) & \
                                    (sf.unix_timestamp(dfbaseData.createdDatePT) - sf.unix_timestamp(dfrmi.date_created) >= 0),'left_outer') \
                    .join(dfrmsNS,(dfrmi.id == dfrmsNS.input_id),'left_outer') \
                    .select(dfbaseData.id, \
                    dfbaseData.mvpLoanApplicationId, \
                    dfbaseData.loanApplicationId, \
                    dfbaseData.clientID, \
                    dfbaseData.mvpClientID, \
                    sf.regexp_replace(dfbaseData.createdDate,"T"," ").cast(TimestampType()).alias("idologyTimestampUTC"), \
srcfilePathParquetGold = "s3://" + bucket + "/" + gold_path + vendor + "/timestamp_date=" + year + "-" + month + "-" + day + ""
srcfilePathParquet = "s3://" + bucket + "/" + enriched_path + vendor + "/Parquet/year=" + year + "/month=" + month + "/day=" + day + ""

tgtfilePathAudits = "s3://" + bucket + "/" + enriched_path + "Audits/" + vendor + "/"

dfparquetGold = sparkSession.read.format("parquet").load(
    srcfilePathParquetGold)

dfparquet = sparkSession.read.format("parquet").load(srcfilePathParquet)

dfparquetGold = dfparquetGold.withColumn("gold_date_created_utc",sf.from_unixtime(dfparquetGold.timestamp/1000,'yyyy-MM-dd').substr(1, 7))
                            # .where(dfparquetGold.successful == "true")

dfparquet = dfparquet.withColumn(
    "enriched_date_created_utc",
    sf.regexp_replace(dfparquet.createdDate, "T", " ").substr(1, 7))

# dfparquetGoldStg = dfparquetGold.groupBy(dfparquetGold.date_created_utc).count().distinct().orderBy(sf.asc("date_created_utc"))
# dfparquetGoldFinal = dfparquetGoldStg.select(dfparquetGoldStg.date_created_utc, dfparquetGoldStg.count)

dfparquetGoldStg = dfparquetGold.groupBy(
    dfparquetGold.gold_date_created_utc).agg(
        countDistinct("_id").alias("gold_cnt_id")).orderBy(
            sf.asc("gold_date_created_utc"))

dfparquetGoldFinal = dfparquetGoldStg.select(
    dfparquetGoldStg.gold_date_created_utc, dfparquetGoldStg.gold_cnt_id)

# dfparquetGoldFinal.show(1, False)

dfparquetStg = dfparquet.groupBy(dfparquet.enriched_date_created_utc).agg(
    trim(df.extra_people)).withColumn("monthly_price", trim(
        df.monthly_price)).withColumn("price", trim(df.price)).withColumn(
            "security_deposit", trim(df.security_deposit)).withColumn(
                "weekly_price",
                trim(df.weekly_price)).withColumn("host_response_rate",
                                                  trim(df.host_response_rate))

df = df.fillna('0', [
    'cleaning_fee', 'extra_people', 'monthly_price', 'price',
    'security_deposit', 'weekly_price'
])
df = df.fillna(datetime.datetime.now().strftime("%m/%d/%Y"), ['last_scraped'])
print("Done Filling zeros")
print(df.columns)

df = df.withColumn("cleaning_fee",regexp_replace(col("cleaning_fee"), "[^\d*\.?\d+]", ""))\
        .withColumn("extra_people",regexp_replace(col("extra_people"), "[^\d*\.?\d+]", ""))\
        .withColumn("monthly_price",regexp_replace(col("monthly_price"), "[^\d*\.?\d+]", ""))\
        .withColumn("price",regexp_replace(col("price"), "[^\d*\.?\d+]", ""))\
        .withColumn("security_deposit",regexp_replace(col("security_deposit"), "[^\d*\.?\d+]", ""))\
        .withColumn("weekly_price",regexp_replace(col("weekly_price"), "[^\d*\.?\d+]", ""))\
        .withColumn("weekly_price",regexp_replace(col("weekly_price"), "N/A", ""))\
        .withColumn("host_response_rate",regexp_replace(col("host_response_rate"), "[^\d+]", ""))
print("cleaning_fee Head")
df.show()
datasource1 = DynamicFrame.fromDF(df, glueContext, "nested")

## @type: ApplyMapping
## @args: [mapping = [("id", "long", "id", "long"), ("last_scraped", "string", "last_scraped", "timestamp"), ("host_id", "long", "host_id", "long"), ("host_name", "string", "host_name", "string"), ("host_since", "string", "host_since", "string"), ("host_response_time", "string", "host_response_time", "string"), ("host_response_rate", "string", "host_response_rate", "float"), ("host_neighbourhood", "string", "host_neighbourhood", "string"), ("host_total_listings_count", "double", "host_total_listings_count", "double"), ("host_identity_verified", "string", "host_identity_verified", "string"), ("neighbourhood_group_cleansed", "string", "neighbourhood_group_cleansed", "string"), ("city", "string", "city", "string"), ("state", "string", "state", "string"), ("zipcode", "long", "zipcode", "long"), ("country_code", "string", "country_code", "string"), ("country", "string", "country", "string"), ("latitude", "double", "latitude", "double"), ("longitude", "double", "longitude", "double"), ("property_type", "string", "property_type", "string"), ("room_type", "string", "room_type", "string"), ("accommodates", "long", "accommodates", "long"), ("bathrooms", "double", "bathrooms", "double"), ("bedrooms", "double", "bedrooms", "double"), ("beds", "double", "beds", "double"), ("bed_type", "string", "bed_type", "string"), ("square_feet", "double", "square_feet", "double"), ("price", "string", "price", "float"), ("weekly_price", "string", "weekly_price", "float"), ("monthly_price", "string", "monthly_price", "float"), ("security_deposit", "string", "security_deposit", "float"), ("cleaning_fee", "string", "cleaning_fee", "float"), ("guests_included", "long", "guests_included", "long"), ("extra_people", "string", "extra_people", "float"), ("number_of_reviews", "long", "number_of_reviews", "long"), ("first_review", "string", "first_review", "timestamp"), ("last_review", "string", "last_review", "timestamp"), ("review_scores_value", "double", "review_scores_value", "double"), ("cancellation_policy", "string", "cancellation_policy", "string")], transformation_ctx = "applymapping1"]
## @return: applymapping1
## @inputs: [frame = datasource0]
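
## The ApplyMapping transform announced above is not included in this excerpt
## (note the @inputs comment names datasource0, while the frame built above is
## datasource1). A minimal sketch with the standard awsglue.transforms API and a
## trimmed mappings list follows; the full list is in the @args comment above.
from awsglue.transforms import ApplyMapping

applymapping1 = ApplyMapping.apply(
    frame=datasource1,
    mappings=[
        ("id", "long", "id", "long"),
        ("price", "string", "price", "float"),
        ("cleaning_fee", "string", "cleaning_fee", "float"),
        # ...remaining mappings as listed in the @args comment
    ],
    transformation_ctx="applymapping1")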
    return SparseVector(x.size, [(k, v) for (k, v) in zip(x.indices, x.values) if v >= t])
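
# The return statement above is the tail of a thresholding helper whose definition
# is cut off, and add_sparse / scale_sparse (used further below) are also missing
# from this excerpt. Minimal sketches of what they plausibly do follow; both are
# assumptions, not the original implementations.
def add_sparse(a, b):
    # Element-wise sum of two SparseVectors of the same size (assumed helper).
    merged = dict(zip(a.indices.tolist(), a.values.tolist()))
    for i, v in zip(b.indices.tolist(), b.values.tolist()):
        merged[i] = merged.get(i, 0.0) + v
    return SparseVector(a.size, merged)

def scale_sparse(x, s):
    # Multiply every stored value of a SparseVector by a scalar (assumed helper).
    return SparseVector(x.size, x.indices.tolist(), [v * s for v in x.values.tolist()])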

if __name__ == "__main__":
    argparser = argparse.ArgumentParser(description='Cluster features')
    argparser.add_argument('-c', '--minCount', type=int, default=1)
    argparser.add_argument('-s', '--clusterSize', type=int, default=1)
    argparser.add_argument('indir', help='Input directory')
    argparser.add_argument('outdir', help='Output directory')
    args = argparser.parse_args()

    spark = SparkSession.builder.appName('Cluster Features').getOrCreate()

    df = spark.read.load(args.indir)

    raw = df.filter(col('size') >= args.clusterSize) \
            .select('cluster', 'size', regexp_replace('text', u'\xad\s*', '').alias('text'))
    raw.cache()

    tok = RegexTokenizer(inputCol='text', outputCol='terms', gaps=False, pattern='\w+') \
          .transform(raw)
    counts = CountVectorizer(inputCol='terms', outputCol='counts', minDF=2.0) \
             .fit(tok).transform(tok)
    
    mergeCounts = udf(lambda va, size: threshold_sparse(scale_sparse(reduce(add_sparse, va), 1.0/size), args.minCount),
                      VectorUDT())

    res = counts.groupBy('cluster', 'size') \
                .agg(mergeCounts(collect_list('counts'), 'size').alias('counts'))

    # lda = LDA(k=2, featuresCol='counts', seed=1, optimizer='em')
    # model = lda.fit(res)
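
    # The commented-out lines above hint at an LDA topic-modelling step, and the
    # parsed 'outdir' argument is unused in this excerpt. A minimal sketch of a
    # plausible ending follows; the topic display and the parquet write are assumptions.
    from pyspark.ml.clustering import LDA

    lda = LDA(k=2, featuresCol='counts', seed=1, optimizer='em')
    lda_model = lda.fit(res)
    lda_model.describeTopics().show(truncate=False)

    res.write.mode('overwrite').parquet(args.outdir)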
Example #34
def text_formatting(spark):
    """ Extract formatting features from the text of a post

    Args:
        spark (SparkSession): used to run queries and commands

    Returns:
        DataFrame: With columns [
            (post)_Id,
            #codelines,
            #html_blocks,
            #headings,
            #referencelist,
            #quotes,
            #codeblocks,
            #themebreaks,
            #codespans,
            #references,
            #links,
            #inline_images,
            #mail_addresses,
            #emphasis,
            #strong
        ]
    """
    # Replaces formatted text that has already been processed
    FILLER = 'x'
    # Parser helper column
    COLNAME = 'processed_text'
    COL = col(COLNAME)

    # Data loading
    post_history_df = spark.read.parquet("/user/***REMOVED***/StackOverflow/PostHistory.parquet") \
        .select(['_PostId', '_Text', '_PostHistoryTypeId']) \
        .filter(col('_PostHistoryTypeId') == 2) \
        .drop('_PostHistoryTypeId')
    post_df = spark.read.parquet('/user/***REMOVED***/StackOverflow/Posts.parquet') \
        .select(['_Id', '_PostTypeId']) \
        .filter(col('_PostTypeId') == 1) \
        .drop("_PostTypeId")
    df = post_history_df.join(post_df,
                              post_df['_Id'] == post_history_df['_PostId'])

    # Count lines and words of the formatted text
    df = df.withColumn('#lines', size(split(col('_Text'), r'\n'))) \
        .withColumn('#words', size(split(col('_Text'), r'\s+')))

    # BLOCK ELEMENTS
    # Count code lines
    df = df.withColumn(COLNAME, split(col('_Text'), regex.CODE_BLOCK_RE)) \
        .withColumn('#codelines', size(COL) - 1) \
        .withColumn('codeline_ratio', col('#codelines') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count HTML blocks
    df = df.withColumn(COLNAME, split(COL, regex.HTML_BLOCK_RE)) \
        .withColumn('#html_blocks', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count headings (1/2)
    df = df.withColumn(COLNAME, split(COL, regex.SETEXT_HEADING_RE)) \
        .withColumn('#headings', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count reference list
    df = df.withColumn(COLNAME, split(COL, regex.REFERENCE_LIST_RE)) \
        .withColumn('#referencelist', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count quotes
    df = df.withColumn(COLNAME, split(COL, regex.QUOTE_RE)) \
        .withColumn('#quotes', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count headings (2/2)
    df = df.withColumn(COLNAME, split(COL, regex.HEADING_RE)) \
        .withColumn('#headings', size(COL) - 1 + col('#headings')) \
        .withColumn('heading_ratio', col('#headings') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count code blocks
    df = df.withColumn(COLNAME, split(COL, regex.FENCED_CODE_RE)) \
        .withColumn('#codeblocks', size(COL) - 1) \
        .withColumn('codeblock_ratio', col('#codeblocks') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count thematic break
    df = df.withColumn(COLNAME, split(COL, regex.THEME_BREAK_RE)) \
        .withColumn('#themebreaks', size(COL) - 1) \
        .withColumn('themebreak_ratio', col('#themebreaks') / col('#lines')) \
        .withColumn(COLNAME, array_join(COL, FILLER))

    # INLINE ELEMENTS
    # Count codespans
    df = df.withColumn(COLNAME, split(COL, regex.CODESPAN_RE)) \
        .withColumn('#codespans', size(COL) - 1) \
        .withColumn('codespan_ratio', col('#codespans') / col('#words')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Remove markdown escapes
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.ESCAPE_RE, FILLER))
    # Count references (1/2)
    df = df.withColumn(COLNAME, split(COL, regex.REFERENCE_RE)) \
        .withColumn('#references', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count links (1/2)
    df = df.withColumn(COLNAME, split(COL, regex.LINK_RE)) \
        .withColumn('#links', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count inline images
    df = df.withColumn(COLNAME, split(COL, regex.INLINE_IMAGE_RE)) \
        .withColumn('#inline_images', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count references (2/2)
    df = df.withColumn(COLNAME, split(COL, regex.SHORT_REFERENCE_RE)) \
        .withColumn('#references', size(COL) - 1 + col('#references')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count links (2/2)
    df = df.withColumn(COLNAME, split(COL, regex.AUTOLINK_RE)) \
        .withColumn('#links', size(COL) - 1 + col('#links')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    # Count mails
    df = df.withColumn(COLNAME, split(COL, regex.AUTOMAIL_RE)) \
        .withColumn('#mail_addresses', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))

    # Remove line breaks, html, stand-alone * or _
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.LINE_BREAK_RE,
                                               FILLER))
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.HTML_RE, FILLER))
    df = df.withColumn(COLNAME, regexp_replace(COL, regex.NOT_STRONG_RE,
                                               FILLER))
    # Count strong & emphasis
    df = df.withColumn(COLNAME, split(COL, regex.EM_STRONG_RE)) \
        .withColumn('#emphasis', size(COL) - 1) \
        .withColumn('#strong', size(COL) - 1) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG_EM_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG_EM3_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG_RE)) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.EMPHASIS_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.EM_STRONG2_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.STRONG2_RE)) \
        .withColumn('#strong', size(COL) - 1 + col('#strong')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn(COLNAME, split(COL, regex.EMPHASIS2_RE)) \
        .withColumn('#emphasis', size(COL) - 1 + col('#emphasis')) \
        .withColumn(COLNAME, array_join(COL, FILLER))
    df = df.withColumn('emphasis_ratio', col('#emphasis') / col('#words')) \
        .withColumn('strong_ratio', col('#strong') / col('#words'))

    # Remove unnecessary columns, including parser helper column
    df = df.drop('_Text', '_PostHistoryTypeId', '_PostId', '#lines', '#words',
                 COLNAME)
    return df
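
# A small usage sketch for the function above, assuming an active SparkSession named
# spark (illustration only):
formatting_df = text_formatting(spark)
formatting_df.select('_Id', '#codelines', '#quotes', '#links').show(5)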
Example #35
def prepareDatasets(sc, spark):
    buisHeader = ['business_id', 'name', 'neighborhood', 'address', 'city', 'state', 'postal_code',
        'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'categories']
    buis = sc.textFile(datapath+'yelp_business.csv', use_unicode=False)
    buis = buis.filter(lambda row: not row.startswith('business_id,name'))\
        .map(lambda row: re.findall(r'(?:[^,"]|"(?:\\.|[^"])*")+', row.replace(',,', ', ,')))\
        .map(lambda row: map(lambda x: x.replace('"', ''), row))\
        .map(lambda row: dict(zip(buisHeader, row)))\
        .filter(lambda row: row['business_id'] and row['longitude'] and row['latitude'])\
        .filter(lambda row: row['business_id'].strip() and row['longitude'].strip() and row['latitude'].strip())\
        .toDF()
    buis = buis.select('business_id', 'name', 'city', 'state', 'postal_code', 'categories',
                        buis['latitude'].cast('float'), buis['longitude'].cast('float'),
                        buis['stars'].cast('float'), buis['review_count'].cast('int'),
                        buis['is_open'].cast('int'))\
        .dropna(how='any', subset=['business_id','longitude', 'latitude'])

    def reviews_mapper(index, lines):
        import csv
        reader = csv.reader(lines)
        if index==0: lines.next()
        for row in reader:
            if len(row) == 9 and len(row[1])==22:
                yield row
    reviewsHeader = ["review_id","user_id","business_id","stars","date","text","useful","funny","cool"]
    reviews = sc.textFile(datapath+'yelp_review.csv', use_unicode=False)\
        .mapPartitionsWithIndex(reviews_mapper)\
        .map(lambda x: dict(zip(reviewsHeader, x)))\
        .toDF()
    reviews = reviews.select(
        "review_id", "user_id", "business_id", "text",
        reviews["stars"].cast('float'), reviews["date"].cast('date'),
        reviews["useful"].cast('int'), reviews["funny"].cast('int'),
        reviews["cool"].cast('int'))\
        .filter(reviews.text.isNotNull())\
        .filter(reviews.business_id.isNotNull())
    reviews = reviews.alias('a').join(buis.alias('b'),
        sf.col('b.business_id') == sf.col('a.business_id'))\
        .select('b.*','a.text') #,'a.user_id')
    reviews = reviews.where(
        'longitude > {:f} and longitude < {:f} and latitude > {:f} and latitude < {:f}'\
        .format(westAMER, eastAMER, southAMER, northAMER)
    ).cache()

    id_text = reviews.select('business_id', 'text')\
        .groupBy('business_id').agg(sf.concat_ws(' ', sf.collect_list("text")).alias('text_concat'))
    reviews = reviews.drop(reviews.text)\
        .select('business_id','categories','state', 'stars')\
        .alias('a').join(id_text.alias('b'),
        sf.col('b.business_id') == sf.col('a.business_id'))\
        .select('a.*','b.text_concat')\
        .distinct()\
        .withColumnRenamed('text_concat', 'text')

    # some data cleansing:
    reviews = reviews.withColumn('text', sf.regexp_replace(reviews.text, '\\/', '/'))
    def cleanse(text):
        re_punc = re.compile('[' + re.escape(punctuation) + '0-9\\n\\t\\r]')
        re_spc = re.compile('[ ]+') # get rid of extra spaces
        return re_spc.sub(' ', re_punc.sub(" ", text))
    cleanser = sf.udf(lambda x: cleanse(x))
    reviews = reviews.withColumn('text', cleanser('text'))
    # tokenizing and removing stop words:
    import pyspark.ml.feature as sparkml
    from pyspark.ml import Pipeline
    tokenizer = sparkml.Tokenizer(inputCol="text", outputCol="words")
    swremover = sparkml.StopWordsRemover(inputCol='words', outputCol='words_clean')
    pipeline = Pipeline(stages=[tokenizer, swremover])
    reviews = pipeline.fit(reviews).transform(reviews)
    reviews = reviews.drop('text', 'words')
    return reviews.cache()
def test_auto_mapper_fhir_patient_resource(
        spark_session: SparkSession) -> None:
    # Arrange
    spark_session.createDataFrame(
        [
            (1, "Qureshi", "Imran", "1970-01-01", "female"),
            (2, "Vidal", "Michael", "1970-02-02", None),
        ],
        ["member_id", "last_name", "first_name", "date_of_birth", "my_gender"],
    ).createOrReplaceTempView("patients")

    source_df: DataFrame = spark_session.table("patients")

    df = source_df.select("member_id")
    df.createOrReplaceTempView("members")

    # Act
    mapper = AutoMapper(view="members",
                        source_view="patients",
                        keys=["member_id"]).complex(
                            Patient(
                                id_=FhirId(A.column("member_id")),
                                birthDate=A.date(A.column("date_of_birth")),
                                name=FhirList([
                                    HumanName(use=NameUseCode("usual"),
                                              family=A.column("last_name"))
                                ]),
                                gender=A.if_not_null(
                                    A.column("my_gender"),
                                    AdministrativeGenderCode(
                                        A.column("my_gender"))),
                            ))

    assert isinstance(mapper, AutoMapper)
    sql_expressions: Dict[str, Column] = mapper.get_column_specs(
        source_df=source_df)
    for column_name, sql_expression in sql_expressions.items():
        print(f"{column_name}: {sql_expression}")

    result_df: DataFrame = mapper.transform(df=df)

    # Assert
    assert len(sql_expressions) == 5
    assert str(sql_expressions["id"]) == str(
        substring(regexp_replace(col("b.member_id"), r"[^A-Za-z0-9\-\.]", "-"),
                  0, 63).cast("string").alias("id"))
    assert str(sql_expressions["resourceType"]) == str(
        lit("Patient").cast("string").alias("resourceType"))
    assert str(sql_expressions["birthDate"]) == str(
        coalesce(
            to_date(col("b.date_of_birth"), "y-M-d"),
            to_date(col("b.date_of_birth"), "yyyyMMdd"),
            to_date(col("b.date_of_birth"), "M/d/y"),
        ).cast("date").alias("birthDate"))
    # assert str(sql_expressions["name"]) == str(
    #     filter(
    #         array(
    #             struct(
    #                 lit("usual").alias("use"),
    #                 col("b.last_name").alias("family"),
    #             )
    #         ), lambda x: x.isNotNull()
    #     ).alias("name")
    # )
    # assert str(sql_expressions["gender"]) == str(
    #     when(col("b.my_gender").isNull(),
    #          None).otherwise(col("b.my_gender")).alias("gender")
    # )

    result_df.printSchema()
    result_df.show()

    assert (result_df.where("member_id == 1").selectExpr(
        "name[0].use").collect()[0][0] == "usual")
    assert (result_df.where("member_id == 1").selectExpr(
        "name[0].family").collect()[0][0] == "Qureshi")

    assert (result_df.where("member_id == 2").selectExpr(
        "name[0].use").collect()[0][0] == "usual")
    assert (result_df.where("member_id == 2").selectExpr(
        "name[0].family").collect()[0][0] == "Vidal")
Example #37
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)
# -- in SQL
# SELECT
# ltrim(' HELLLOOOO '),
# rtrim(' HELLLOOOO '),
# trim(' HELLLOOOO '),
# lpad('HELLOOOO ', 3, ' '),
# rpad('HELLOOOO ', 10, ' ')
# FROM dfTable

# regexp
from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
    regexp_replace(col("Description"), regex_string,
                   "COLOR").alias("color_clean"), col("Description")).show(2)
# -- in SQL
# SELECT
#   regexp_replace(Description, 'BLACK|WHITE|RED|GREEN|BLUE', 'COLOR') as
#   color_clean, Description
# FROM dfTable

# replace strings
from pyspark.sql.functions import translate
df.select(translate(col("Description"), "LEET", "1337"),col("Description"))\
  .show(2)
# -- in SQL
# SELECT translate(Description, 'LEET', '1337'), Description FROM dfTable

# regexp_extract
from pyspark.sql.functions import regexp_extract
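
# The regexp_extract import above has no usage in this excerpt; a minimal sketch in
# the style of the surrounding examples (the group pattern mirrors regex_string
# defined earlier, and the alias is an assumption):
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
    regexp_extract(col("Description"), extract_str, 1).alias("color_clean"),
    col("Description")).show(2)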
def process(rdd):
    start = time.time()
    global accuracy
    global completed
    # Get the singleton instance of SparkSession
    spark = getSparkSessionInstance(rdd.context.getConf())
    # Convert RDD[String] to RDD[Row] to DataFrame
    rowRdd = rdd.map(
        lambda x: Row(Summons_Number=str(x[0]),
                      Registration_State=str(x[2]),
                      Plate_Type=str(x[3]),
                      Violation_Code=str(x[5]),
                      Vehicle_Body_Type=str(x[6]),
                      Vehicle_Make=str(x[7]),
                      Issuing_Agency=str(x[8]),
                      Street_Code1=str(x[9]),
                      Street_Code2=str(x[10]),
                      Street_Code3=str(x[11]),
                      Violation_County=str(x[13]),
                      Issuer_Precinct=str(x[14]),
                      Issuer_Command=str(x[16]),
                      Issuer_Squad=str(x[17]),
                      Violation_In_Front_Of_Or_Opposite=str(x[21]),
                      Issue_Date=str(x[4]),
                      Violation_Time=str(x[18]),
                      Violation_Location=str(x[20])))
    df = spark.createDataFrame(rowRdd)
    ############################################## PREPROCESSING ##################################
    #Splitting the issue date into month,year,day
    df_new = df.withColumn('Month',
                           split('Issue_Date', '/')[0]).withColumn(
                               'Year',
                               split('Issue_Date', '/')[2]).withColumn(
                                   'Day',
                                   day_udf(col('Issue_Date'))).withColumn(
                                       'Time', time_udf(col('Violation_Time')))
    #converting the columns into numeric types
    df_new = df_new.withColumn("Year", df_new["Year"].cast(
        IntegerType())).withColumn("Month", df_new["Month"].cast(
            DoubleType())).withColumn("Day", df_new["Day"].cast(
                DoubleType())).withColumn("Time",
                                          df_new["Time"].cast(DoubleType()))
    #Removing outliers and some filtering
    df_new = df_new.drop(
        *['Issue_Date', 'Violation_Time', 'Year', 'Issuer_Squad'])
    #Filling na
    df_new = df_new.fillna({'Time': 3})
    #Removing rows with null Violation_Location or Violation_County
    df_new = df_new.dropna(how='any',
                           subset=['Violation_Location', 'Violation_County'])
    df_new = df_new.dropna(how='any')
    #Mapping violation location
    df_new = df_new.withColumn('Violation_Location', regexp_replace('Violation_Location', 'KINGS', 'K'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'KING', 'K'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'QUEEN', 'Q'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'QU', 'Q'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'NEWY', 'NY'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'NEW Y', 'NY'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'MAN', 'NY'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'MH', 'NY'))\
    .withColumn('Violation_Location', regexp_replace('Violation_Location', 'BRONX', 'BX'))
    # df_new.show()
    ################################################################################################
    # Prediction using saved model
    df_r1 = model.transform(df_new)
    # df_r1.show()
    # df_r1.dropna()
    df_with_cat = df_r1.withColumn("correct",
                                   udfaccuracy("label", "prediction"))
    # df_with_cat.show()
    correct_array = df_with_cat.select(
        "label",
        "prediction").rdd.map(lambda r: int(r[0]) - int(r[1]) == 0).collect()
    num = len(correct_array)
    temp = sum(correct_array)
    completed += num
    accuracy += temp
    end = time.time()
    print("Labels correct till now:{}/{}".format(accuracy, completed))
    print("Completed batch of {} in {}sec".format(num, end - start))
    df_r1.show()
def applyModel(fileName, loadModelName, outlierPercentile = 100):

    sc = SparkContext( 'local', 'pyspark')
    sqlContext = SQLContext(sc)

    #########
    # load data
    #########

    data = sc.textFile(fileName)
    #extract header and remove it
    header = data.first()
    data = data.filter(lambda x:x !=header).cache()
    header = header.split('\t')
    #parse data
    data = data.map(lambda x : x.split('\t'))

    #########
    # prepare features
    #########

    df = sqlContext.createDataFrame(data, header)
    df = (df.withColumn("ADLOADINGTIME",func.regexp_replace('ADLOADINGTIME', 'null', '0').cast('float'))
         .withColumn("TIMESTAMP",func.regexp_replace('TIMESTAMP', 'null', '0').cast('int'))
         .withColumn("GEOIP_LAT",func.regexp_replace('GEOIP_LAT', 'null', '0').cast('int'))
          .withColumn("GEOIP_LNG",func.regexp_replace('GEOIP_LNG', 'null', '0').cast('int'))
          .withColumn("HOSTWINDOWHEIGHT",func.regexp_replace('HOSTWINDOWHEIGHT', 'null', '0').cast('int'))
          .withColumn("HOSTWINDOWWIDTH",func.regexp_replace('HOSTWINDOWWIDTH', 'null', '0').cast('int'))
          .withColumn("TOPMOSTREACHABLEWINDOWHEIGHT",func.regexp_replace('TOPMOSTREACHABLEWINDOWHEIGHT', 'null', '0').cast('int'))
          .withColumn("TOPMOSTREACHABLEWINDOWWIDTH",func.regexp_replace('TOPMOSTREACHABLEWINDOWWIDTH', 'null', '0').cast('int'))
         )
    thr = np.percentile(df.select("ADLOADINGTIME").rdd.collect(), outlierPercentile)
    df = df.filter(func.col('ADLOADINGTIME') < thr)
    df = df.withColumn("TOPMOSTREACHABLEWINDOWAREA", func.col("TOPMOSTREACHABLEWINDOWHEIGHT")*func.col("TOPMOSTREACHABLEWINDOWWIDTH"))
    df = df.withColumn("INTENDENTISACTUALDEVICETYPE", (func.col("ACTUALDEVICETYPE")==func.col("INTENDEDDEVICETYPE")).cast('int'))
    df = df.withColumn("COMBINEDID", 
            func.concat(
                func.col('ACCOUNTID'), 
                func.col('CAMPAIGNID'), 
                func.col('CREATIVEID'), 
                func.col('SDK')) )

    #df = df.withColumn("COMBINEDID", func.regexp_replace("COMBINEDID", '^$', 'NA'))

    df = df.withColumn("COMBINEDEXTERNALID", 
            func.concat( 
                func.regexp_replace('EXTERNALADSERVER', 'null', ''), 
                func.regexp_replace('EXTERNALPLACEMENTID', 'null', ''), 
                func.regexp_replace('EXTERNALSITEID', 'null', ''), 
                func.regexp_replace('EXTERNALSUPPLIERID', 'null', '') ))

    #df = df.withColumn("COMBINEDEXTERNALID", func.regexp_replace("COMBINEDEXTERNALID", '^$', 'NA'))

    df = df.withColumn("PLATFORMCOMBINED", 
            func.concat( 
                func.regexp_replace('PLATFORM', 'null', ''), 
                func.regexp_replace('PLATFORMVERSION', 'null', '') ))

    #df = df.withColumn("PLATFORMCOMBINED", func.regexp_replace("PLATFORMCOMBINED", '^$', 'NA'))

    df = df.withColumn("UA_OSCOMB", 
            func.concat( 
                func.regexp_replace('UA_OS', 'null', ''), 
                func.regexp_replace('UA_OSVERSION', 'null', '') ))

    #df = df.withColumn("UA_OSCOMB", func.regexp_replace("UA_OSCOMB", '^$', 'NA'))
    df = df.withColumn("FILESJSON_SIZE", 
                func.regexp_replace('FILESJSON', '[^,\d]', '') )

    df = df.withColumn("FILESJSON_SIZE", 
                func.regexp_replace('FILESJSON_SIZE', '^,', '') )

    df = df.withColumn("FILESJSON_SIZE", 
                func.regexp_replace('FILESJSON_SIZE', ',,', ',') )

    udf = func.udf(lambda x: int(np.fromstring(x,dtype=int, sep=',').sum()), IntegerType())
    df = df.withColumn("FILESJSON_SIZE", udf("FILESJSON_SIZE"))

    print('Loaded and prepared %d entries' % df.count())

    #########
    # keep only needed features
    #########   

    features = ['ADLOADINGTIME',
     'PLACEMENTID',
     'TIMESTAMP',
     'CREATIVETYPE',
     'UA_HARDWARETYPE',
     'UA_VENDOR',
     'UA_MODEL',
     'UA_BROWSER',
     'UA_BROWSERVERSION',
     'FILESJSON',
     'ERRORSJSON',
     'TOPMOSTREACHABLEWINDOWAREA',
     'FILESJSON_SIZE',
     'COMBINEDID',
     'COMBINEDEXTERNALID',
     'PLATFORMCOMBINED',
     'UA_OSCOMB',
     'SDK',
     'EXTERNALADSERVER'
       ]

    df = df.select(features)

    #########
    # Convert categorical features to numerical
    #########   


    featuresCat = [
     'PLACEMENTID',
     'CREATIVETYPE',
     'UA_HARDWARETYPE',
     'UA_VENDOR',
     'UA_MODEL',
     'UA_BROWSER',
     'UA_BROWSERVERSION',
     'FILESJSON',
     'ERRORSJSON',
     'COMBINEDID',
     'COMBINEDEXTERNALID',
     'PLATFORMCOMBINED',
     'UA_OSCOMB',
     'SDK',
     'EXTERNALADSERVER'
       ]

    for i in range(len(featuresCat)):

        indexer = StringIndexer(inputCol=featuresCat[i], outputCol='_'+featuresCat[i]).setHandleInvalid("skip").fit(df)
        df = indexer.transform(df).drop(featuresCat[i])
        writer = indexer._call_java("write")
        writer.overwrite().save("indexer_" + featuresCat[i])    

    featuresCat = [ '_' + featuresCat[i] for i in range(len(featuresCat))]    

    features = featuresCat[:]
    features.append('TIMESTAMP')    
    features.append('FILESJSON_SIZE')
    features.append('TOPMOSTREACHABLEWINDOWAREA')


    #########
    # Assemble features
    #########   


    assembler = VectorAssembler(
        inputCols=features,
        outputCol="features")

    df = assembler.transform(df)

    #########
    # Convert to labeled point
    #########   


    lp = (df.select(func.col("ADLOADINGTIME").alias("label"), func.col("features"))
      .rdd.map(lambda row: LabeledPoint(row.label, row.features)))
    lp.cache()


    #########
    # Load trained model
    #########
    
    model = RandomForestModel.load(sc, loadModelName)
    
    print('Model loaded!')
    
    predictions = model.predict(lp.map(lambda x: x.features)).collect()
    
    return predictions
from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
    rtrim(lit("    HELLO    ")).alias("rtrim"),
    trim(lit("    HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)


# COMMAND ----------

from pyspark.sql.functions import regexp_replace
regex_string = "BLACK|WHITE|RED|GREEN|BLUE"
df.select(
  regexp_replace(col("Description"), regex_string, "COLOR").alias("color_clean"),
  col("Description")).show(2)


# COMMAND ----------

from pyspark.sql.functions import translate
df.select(translate(col("Description"), "LEET", "1337"),col("Description"))\
  .show(2)


# COMMAND ----------

from pyspark.sql.functions import regexp_extract
extract_str = "(BLACK|WHITE|RED|GREEN|BLUE)"
df.select(
    def _transform_data(df_price, df_demanda, df_produccion, df_pinternac,
                        df_subasta):
        """Transform original dataset.

        :param df: Input DataFrame.
        :param redes: Redes file that comes from Investigation Office
        :param entity_: Entity Zurich 'Z' or Another (BANC SABADELL 'BS')
        :return: Transformed DataFrame.
        """
        # Fix European number formatting: drop '.' thousands separators and use '.' as the decimal mark
        bad_columns = [
            'TOTAL_IMPORTACION_ES', 'TOTAL_PRODUCCION_ES',
            'TOTAL_DEMANDA_NAC_ES', 'TOTAL_EXPORTACIONES_ES', 'TOTAL_DDA_ES',
            'TOTAL_POT_IND_ES', 'TOTAL_PRODUCCION_POR', 'TOTAL_DEMANDA_POR'
        ]
        for i in bad_columns:
            df_demanda = (df_demanda.withColumn(i, regexp_replace(
                i, '\\.',
                '')).withColumn(i,
                                regexp_replace(i, ',', '.').cast('float')))

        bad_columns = [
            'HIDRAULICA_CONVENC', 'HIDRAULICA_BOMBEO', 'NUCLEAR',
            'CARBON NACIONAL', 'CARBON_IMPO', 'CICLO_COMBINADO',
            'FUEL_SIN_PRIMA', 'FUEL_PRIMA', 'REG_ESPECIAL'
        ]
        for i in bad_columns:
            df_produccion = (df_produccion.withColumn(
                i, regexp_replace(i, '\\.', '')).withColumn(
                    i,
                    regexp_replace(i, ',', '.').cast('float')))
        # These rows correspond to zero production or zero imports
        df_produccion = df_produccion.fillna(0)
        df_demanda = df_demanda.fillna(0)

        # Date Variables
        df_price = df_price.select(
            *['ANIO', 'MES', 'DIA', 'HORA', 'PESPANIA', 'PPORTUGAL'])
        funct = udf(lambda x: x.zfill(2), StringType())

        df_price = df_price.withColumn('DIA', funct(df_price['DIA']))
        df_produccion = df_produccion.withColumn('DIA',
                                                 funct(df_produccion['DIA']))
        df_demanda = df_demanda.withColumn('DIA', funct(df_demanda['DIA']))
        df_price = df_price.withColumn('MES', funct(df_price['MES']))
        df_produccion = df_produccion.withColumn('MES',
                                                 funct(df_produccion['MES']))
        df_demanda = df_demanda.withColumn('MES', funct(df_demanda['MES']))

        df_demanda = df_demanda.withColumn(
            'DATE',
            concat(col('DIA'), lit('-'), col('MES'), lit('-'), col('ANIO')))
        df_produccion = df_produccion.withColumn(
            'DATE',
            concat(col('DIA'), lit('-'), col('MES'), lit('-'), col('ANIO')))
        df_price = df_price.withColumn(
            'DATE',
            concat(col('DIA'), lit('-'), col('MES'), lit('-'), col('ANIO')))

        # Group By Day
        df_price = (df_price.groupby('DATE').agg({
            'PESPANIA': 'avg',
            'PPORTUGAL': 'avg'
        }).withColumnRenamed('avg(PESPANIA)', 'PSPAIN').withColumnRenamed(
            'avg(PPORTUGAL)', 'PPORTUGAL'))

        df_demanda = df_demanda.groupby('DATE').sum()
        df_produccion = df_produccion.fillna(0)
        df_produccion = df_produccion.groupby('DATE').sum()

        # AUCTION (SUBASTA)
        df = df_price.join(df_subasta, how='left',
                           on='DATE').fillna({'DUMMY': 0})
        delete_var = ['ANIO', 'MES', 'DIA', 'HORA']
        df_demanda = df_demanda.drop(*delete_var)
        df_produccion = df_produccion.drop(*delete_var)
        df = df.drop(*['ANIO', 'DIA', 'HORA'])

        df = df.join(df_demanda, how='left', on='DATE')
        df = df.join(df_produccion, how='left', on='DATE')

        # INTERPOLATE
        df = df.toPandas()
        df_pinternac = df_pinternac.interpolate(limit_direction='backward',
                                                method='nearest')
        df = pd.merge(df,
                      df_pinternac,
                      how='left',
                      left_on='DATE',
                      right_on='FECHA')
        del df['FECHA']
        df['DATE'] = pd.to_datetime(df['DATE'], format='%d-%m-%Y')
        '''
        for colm in ['TAVG', 'TMAX', 'TMIN']:
            df_weather_nor[colm] = ((df_weather_nor[colm] - 32)*5/9).map(int) # Fahrenheit to Celsius
        '''
        # DUMMY VARS
        df = df.sort_values(by='DATE', ascending=True)
        dummy_var = [
            3, 5, 7, 10, 14, 15, 20, 25, 30, 45, 50, 55, 60, 65, 70, 75, 80,
            85, 90
        ]
        df.loc[df['DUMMY'].isin(['0', 0]), 'DUMMY'] = np.NaN
        for i in dummy_var:
            name = 'DUMMY_BACK_' + str(i) + '_DAY'
            df[name] = pd.Series(df['DUMMY'], index=df.index)
            rows = i
            df[name] = df[name].bfill(axis=0, limit=rows)
            df[name] = df[name].fillna(0)
        for i in dummy_var:
            name = 'DUMMY_FORW_' + str(i) + '_DAY'
            df[name] = pd.Series(df['DUMMY'], index=df.index)
            rows = i
            df[name] = df[name].ffill(axis=0, limit=rows)
            df[name] = df[name].fillna(0)
        df['DUMMY'] = df['DUMMY'].fillna(0)
        df = df.dropna(axis=0, how='any')

        # WORK DAY
        df['DATE'] = pd.to_datetime(df['DATE'], format='%d-%m-%Y')
        df['WEEKDAY'] = df['DATE'].dt.dayofweek
        df['MES'] = df['DATE'].dt.month

        df['WORKDAY'] = pd.Series(0, index=df.index)
        df.loc[df['WEEKDAY'].isin([0, 1, 2, 3, 4]), 'WORKDAY'] = 1
        for i in STRING.feriados_spain:
            df.loc[df['DATE'] == i, 'WORKDAY'] = 0
        del df['WEEKDAY']

        # NULL PRICE
        df['NULL_PRICE'] = pd.Series(0, index=df.index)
        df.loc[df['DATE'].between('2013-03-28', '2013-04-02', inclusive=True),
               'NULL_PRICE'] = 1

        # SUMMER-WINTER
        df['SUMMER'] = pd.Series(0, index=df.index)
        df.loc[df['MES'].isin([7, 8]), 'SUMMER'] = 1
        df['WINTER'] = pd.Series(0, index=df.index)
        df.loc[df['MES'].isin([12, 1]), 'WINTER'] = 1
        del df['MES']
        bool_cols = [
            col for col in df if df[[col]].dropna().isin([0, 1]).all().values
        ]
        for i in df.drop(bool_cols + ['DATE'], axis=1).columns.values.tolist():
            df[i] = df[i].map(float)
            df[i] = df[i].round(2)

        return df