Example #1
    def getReadRDBMSDataFrame():

        from_format = "dd/MMM/yyyy:HH:mm:ss"
        to_format = 'dd-MM-yyyy'
        storeIncrementalValue(0, "ignore")

        df = spark.read.format("csv") \
            .option("inferSchema", True) \
            .option("header", True) \
            .load(argv[17])

        df = df.filter(df.createdt.isNotNull() & (df.createdt != "null"))

        # Normalise the timestamp format, cast the metric columns to string, and keep only rows newer than the last load
        result = df.withColumn("createdt", f.from_unixtime(f.unix_timestamp(f.col("createdt"), from_format), to_format)) \
            .withColumn("value", f.col("value").cast("string")) \
            .withColumn("score", f.col("score").cast("string")) \
            .withColumn("regioncode", f.col("regioncode").cast("string")) \
            .withColumn("status", f.col("status").cast("string")) \
            .withColumn("count", f.col("count").cast("string")) \
            .withColumn("statuscode", f.col("statuscode").cast("string")) \
            .select("id", "username", "amount", "ip", "createdt", "value", "score", "regioncode", "status", "method",
                    "key", "count", "type", "site", "statuscode") \
            .withColumn("rdbms_current_date", f.current_date()) \
            .filter(f.col("id") > getIncrementalValue())

        result = result.withColumn(
            "id",
            f.col("id").cast("string")).na.fill("Not_Applicable")

        max_value = df.agg({"id": "max"}).collect()[0]
        storeIncrementalValue(max_value["max(id)"], "overwrite")

        result.printSchema()

        return result
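getReadRDBMSDataFrame relies on two helpers that are not shown, storeIncrementalValue and getIncrementalValue, which persist the highest id loaded so far. A minimal file-based sketch of what they might look like (the checkpoint path and exact semantics are assumptions, not part of the original):

import os

CHECKPOINT_PATH = "/tmp/rdbms_incremental_id"  # assumed location, not from the original

def storeIncrementalValue(value, mode):
    # "ignore" writes the value only when no checkpoint exists yet; "overwrite" always writes it
    if mode == "ignore" and os.path.exists(CHECKPOINT_PATH):
        return
    with open(CHECKPOINT_PATH, "w") as fh:
        fh.write(str(value))

def getIncrementalValue():
    # Return the last id that was loaded, or 0 on the first run
    if not os.path.exists(CHECKPOINT_PATH):
        return 0
    with open(CHECKPOINT_PATH) as fh:
        return int(fh.read().strip())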
Example #2
def token_score(df, on, value):
    """Token-sort fuzzy score between column `on` and the query string `value`."""
    q_val = value
    df = df.select([on])
    df = df.withColumn('query', F.lit(q_val).cast(T.StringType()))
    # Tokenise both sides on the separator pattern (',' here; adjust to the data)
    pattern = ','
    df = df.withColumn('tokens1', F.split(F.col(on), pattern))
    df = df.withColumn('tokens2', F.split(F.col('query'), pattern))
    # intersection = tokens shared by both sides
    # diff1to2 = tokens only in tokens1, diff2to1 = tokens only in tokens2
    # (array_intersect / array_except require Spark 2.4+)
    df = df.withColumn('intersection', F.array_intersect('tokens1', 'tokens2'))
    df = df.withColumn('diff1to2', F.array_except('tokens1', 'tokens2'))
    df = df.withColumn('diff2to1', F.array_except('tokens2', 'tokens1'))
    # Sort each token set and join it back into a single string
    df = df.withColumn('sorted_sect',
                       F.concat_ws(' ', F.sort_array('intersection')))
    df = df.withColumn('sorted_1to2',
                       F.concat_ws(' ', F.sort_array('diff1to2')))
    df = df.withColumn('sorted_2to1',
                       F.concat_ws(' ', F.sort_array('diff2to1')))
    # combined_1to2 / combined_2to1 = shared tokens followed by each side's own tokens
    df = df.withColumn('combined_1to2',
                       F.concat_ws(' ', 'sorted_sect', 'sorted_1to2'))
    df = df.withColumn('combined_2to1',
                       F.concat_ws(' ', 'sorted_sect', 'sorted_2to1'))
    # Strip leading/trailing whitespace
    for c in ['sorted_sect', 'combined_1to2', 'combined_2to1']:
        df = df.withColumn(c, F.trim(F.col(c)))
    # Pairwise similarity ratios between the sorted strings
    # (spark_ratio must be supplied; see the sketch below)
    df = df.withColumn(
        'ratio1', spark_ratio(F.col('sorted_sect'), F.col('combined_1to2')))
    df = df.withColumn(
        'ratio2', spark_ratio(F.col('sorted_sect'), F.col('combined_2to1')))
    df = df.withColumn(
        'ratio3', spark_ratio(F.col('combined_2to1'), F.col('combined_1to2')))
    # Keep the best of the three ratios as the token fuzzy score
    df = df.withColumn('max_ratio', F.greatest('ratio1', 'ratio2', 'ratio3'))
    df = df.withColumnRenamed('max_ratio', 'token_fuzzy')
    df = df.select(['token_fuzzy'])
    return df
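token_score calls a spark_ratio column function that the original leaves as a TODO. A minimal sketch under the assumption that a fuzzywuzzy-style similarity ratio is wanted (the UDF name is taken from the call sites above; the difflib-based implementation is an assumption):

from difflib import SequenceMatcher
from pyspark.sql import functions as F, types as T

@F.udf(returnType=T.DoubleType())
def spark_ratio(s1, s2):
    # Similarity in [0, 1] between two strings, in the spirit of fuzzywuzzy's ratio
    if s1 is None or s2 is None:
        return None
    return SequenceMatcher(None, s1, s2).ratio()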
Example #3
def fuzzyspark(df, on, value):
    q_val = value
    df = df.select([on])
    df = df.withColumn('query', F.lit(q_val).cast(T.StringType()))
    # Normalise by the shorter of the two string lengths
    df = df.withColumn(
        'len', F.least(F.length(on),
                       F.lit(len(q_val)).cast(T.IntegerType())))
    df = df.withColumn('levenshtein', F.levenshtein(on, 'query'))
    df = df.withColumn('score', F.col('levenshtein') / F.col('len'))
    df = df.select(['score'])
    return df
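A quick usage sketch (the SparkSession, sample DataFrame and query value are illustrative, not from the original):

names = spark.createDataFrame([("john smith",), ("jon smyth",), ("alice",)], ["name"])
fuzzyspark(names, "name", "john smith").show()
# score = Levenshtein distance divided by the shorter length, so 0.0 means an exact match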
Example #4
def get_prod_table_indexed(prod_table, prod_l_code, prod_l_desc,
                           prod_l_plus_desc, prod_l_cat):
    """
    Fetch product table with the required category level data with the index , prod_l_cat is the new column that is created (passed as a parameter)
    """

    # UDFs run row by row over the partitioned DataFrame, unlike built-in
    # column functions, which Spark evaluates natively on whole columns.
    char_replace = F.udf(replace_chars, T.StringType())  # StringType comes from pyspark.sql.types (as T)
    if prod_l_plus_desc in prod_table.columns:
        prod_columns = [prod_l_code, prod_l_desc, prod_l_plus_desc]
        return prod_table.select(prod_columns).withColumn(
            prod_l_cat,
            F.concat(
                F.lit("_"),
                char_replace(prod_l_code, F.lit(r"[^A-Za-z0-9]+"), F.lit(r"")),
                F.lit("_"),
                char_replace(prod_l_desc, F.lit(r"[^A-Za-z]+"), F.lit(r"")),
                F.lit("_"),
                char_replace(prod_l_plus_desc, F.lit(r"[^A-Za-z]+"),
                             F.lit(r"")))).withColumn(
                                 "prod_index",
                                 F.dense_rank().over(
                                     Window.orderBy(prod_l_cat)))

    else:
        prod_columns = [prod_l_code, prod_l_desc]
        return prod_table.select(prod_columns).withColumn(
            prod_l_cat,
            F.concat(
                F.lit("_"),
                char_replace(prod_l_code, F.lit(r"[^A-Za-z0-9]+"), F.lit(r"")),
                F.lit("_"),
                char_replace(prod_l_desc, F.lit(r"[^A-Za-z]+"),
                             F.lit(r"")))).withColumn(
                                 "prod_index",
                                 F.dense_rank().over(
                                     Window.orderBy(prod_l_cat)))
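get_prod_table_indexed wraps a replace_chars helper that is not shown. A minimal sketch, with the signature inferred from the char_replace call sites above (the implementation itself is an assumption):

import re

def replace_chars(text, pattern, repl):
    # Replace every substring of `text` matching the regex `pattern` with `repl`
    if text is None:
        return None
    return re.sub(pattern, repl, text)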
data.cache()

# number of loans falling into each target with percentage
data.groupBy("TARGET").count().withColumn("Percentage", F.col("count") * 100 / data.count()).show()

# number of missing values in each column
counts = [(x, data.filter(F.col(x).isNull()).count()) for x in data.columns]
counts.sort()

# percentage of missing values in each column
print([(name, x, round(x * 100.0 / data.count(), 2)) for name, x in counts if x > 0])

# number of columns of each datatype
print(Counter((x[1] for x in data.dtypes)))

# view unique values in all string columns
str_col_names = [name for name, dtype in data.dtypes if dtype == 'string']
unique_df = data.agg(*((F.countDistinct(F.col(c))).alias(c) for c in
                       str_col_names))
unique_df.show()

# describe days employed
data.select('DAYS_EMPLOYED').describe().show()

# describe days birth column
data = data.withColumn("AGE", F.col("DAYS_BIRTH") / -365)
data.select("DAYS_BIRTH", "AGE").describe().show()

# dig deep on days employed

anom = data.filter(F.col('DAYS_EMPLOYED') == 365243)
non_anom = data.filter(F.col('DAYS_EMPLOYED') != 365243)
def main(args):
    # Create a directory to save outputs
    if args.dst.exists():
        logger.info('Found an existing destination folder. Deleting...')
        shutil.rmtree(args.dst, ignore_errors=True)

    # Create Spark Session
    if args.spark_driver_mem is not None:
        driver_mem = f'{args.spark_driver_mem}g'
    else:
        driver_mem = '{0}g'.format(int(psutil.virtual_memory().total // 1e9))
    spark = (SparkSession.builder.appName(__name__).config(
        'spark.driver.memory', driver_mem).getOrCreate())

    # Build schema
    schema = t.StructType([
        t.StructField('absolute_url', t.StringType()),
        t.StructField('author', t.StringType()),
        t.StructField('author_str', t.StringType()),
        t.StructField('cluster', t.StringType()),
        t.StructField('date_created', t.DateType()),
        t.StructField('date_modified', t.DateType()),
        t.StructField('download_url', t.StringType()),
        t.StructField('extracted_by_ocr', t.BooleanType()),
        t.StructField('html', t.StringType()),
        t.StructField('html_columbia', t.StringType()),
        t.StructField('html_lawbox', t.StringType()),
        t.StructField('html_with_citations', t.StringType()),
        t.StructField('id', t.LongType()),
        t.StructField('joined_by', t.ArrayType(t.StringType())),
        t.StructField('local_path', t.StringType()),
        t.StructField('opinions_cited', t.ArrayType(t.StringType())),
        t.StructField('page_count', t.IntegerType()),
        t.StructField('per_curiam', t.BooleanType()),
        t.StructField('plain_text', t.StringType()),
        t.StructField('resource_uri', t.StringType()),
        t.StructField('sha1', t.StringType()),
        t.StructField('type', t.StringType()),
    ])

    # Find all json files
    json_df = spark.read.json(str(args.src), schema=schema, multiLine=True)
    logger.info(f'Processing {json_df.count()} JSON files...')

    # Create UDF to parse html markups
    global parse_html
    parse_html = partial(parse_html, concat_str=args.concat_str)
    parse_html_udf = f.udf(parse_html, t.StringType())

    # Concatenating columns
    logger.info(f"Extracting text columns ({', '.join(args.text_fields)}) and "
                f"concatenating with '{args.concat_str}'...")
    CAT_COL = 'concat'
    # First concat text columns in a row (i.e. JSON file)
    # Then, concat with all rows together
    texts_df = (json_df.select(*args.text_fields).withColumn(
        CAT_COL, f.concat_ws(' ', *args.text_fields)).withColumn(
            CAT_COL, parse_html_udf(CAT_COL)).agg(
                f.collect_list(CAT_COL).alias(CAT_COL)).withColumn(
                    CAT_COL, f.concat_ws(' ', CAT_COL)))

    # Write file
    logger.info(f'Writing text to {args.dst}')
    texts_df.write.text(str(args.dst))
    logger.info(f'Text successfully saved to {args.dst}')
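main() expects an argparse-style namespace. A minimal wiring sketch, with argument names inferred from the attribute accesses above (the defaults and help texts are assumptions):

import argparse
from pathlib import Path

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='Extract, clean and concatenate opinion text from JSON files.')
    parser.add_argument('src', type=Path, help='directory containing the JSON files')
    parser.add_argument('dst', type=Path, help='output directory for the text files')
    parser.add_argument('--text-fields', nargs='+', default=['plain_text'],
                        help='JSON fields to extract and concatenate')
    parser.add_argument('--concat-str', default=' ',
                        help='separator used when concatenating fields')
    parser.add_argument('--spark-driver-mem', type=int, default=None,
                        help='driver memory in GB (defaults to total system memory)')
    main(parser.parse_args())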
Example #7
if __name__ == "__main__":
    conf = SparkConf().setAppName('Twitter Parse')

    sc = SparkContext(conf=conf).getOrCreate()

    assert sc.version >= '2.3'  # make sure we have Spark 2.3+

    sqlContext = SQLContext(sc)

    spark = SparkSession.builder.appName('Twitter Parse').getOrCreate()

    spark.sparkContext.setLogLevel('WARN')


    sc.setLogLevel('WARN')
    
    def remove_diacritics(s):
        return unidecode.unidecode(s)

    # StringType lives in pyspark.sql.types (assumed imported as `types`), not in pyspark.sql.functions
    rem_udf = functions.udf(remove_diacritics, types.StringType())

    
    #er = EntityResolution(os.path.join(bdenv_loc,'twitter_parse_2018_01.json'), "stopwords.txt")               
    #er.preprocessDF()
    for i in range(1,7):
        er = EntityResolution(os.path.join(bdenv_loc,'twitter_parse_2018_0{}.json'.format(i)), "stopwords.txt")               
        er.preprocessDF()
    
    
    
Example #8
    "dendogram_index_list_nsovr1.xlsx"
]
temp_time = time()
# prod_l_code, prod_l_desc, prod_l_cat = get_prod_level(prod_cat_level)
prod_l_code = "product_code"
prod_l_desc = "ns_harm"
prod_l_cat = "ns_harm"  #variable that will have values concat of code and desc

print(prod_l_code)
print(prod_l_desc)
print(prod_l_cat)

user = get_user()
print("user: " + user)

# trans          = sqlContext.sql("""select * from x5_ru_analysis.kg_SM_karusel_100per2_52weeks
#                                     where item_spend>0 and item_qty >0""")
#trans          = trans.withColumnRenamed("category_id","generaltype_id")
# trans          = trans.where(F.col("store_format")=="TRADITIONAL_TRADE")

# Select distinct product_code, dunn_cat_english_1, group_code
#                                   from x5_ru_analysis.ak_prod_karusel_2
# prod_table     = sqlContext.sql("""
# SELECT DISTINCT PRODUCT_CODE,NS_HARM,NS_HARM AS GROUP_CODE FROM (SELECT A.*,CONCAT('_',GROUP_CODE,'_',DUNN_CAT_ENGLISH) AS NEW_CAT
# FROM x5_ru_analysis.ak_prod_karusel_2 A) A INNER JOIN
# X5_RU_ANALYSIS.AK_NS_MAP_KARUSEL1 B ON A.NEW_CAT = B.KARU_CATEGORY_FINAL
#                                   """)

# prod_table  = prod_table.select([prod_l_code,prod_l_desc,"group_code"]).withColumn(
sc = SparkContext()
ss = SparkSession.builder.master("local").getOrCreate()

rows = sc.textFile("/logs_nasa/NASA_access_log_Jul95").map(parse_interaction)
not_null = lambda row: row is not None
rows = rows.filter(not_null)

df = ss.createDataFrame(rows)
# task1
df_errors = df[df.return_code.like("5%")]
df_errors_length = df_errors.groupBy(["host", "request"]).count()
df_errors_length.coalesce(1).write.csv("task2.csv", mode="append")

#task2
df_requests_fields_combs = df.groupBy(["date", "request_method", "return_code"]).count()
df_requests_fields_combs.coalesce(1).write.csv("task3.csv")

#task3
days = lambda i: i * 86400
df_codes = df[(df.return_code.like("5%") | df.return_code.like("4%"))]
df_codes_grouped = df_codes.groupBy(["return_code", "date"]).count()

udf_myFunction = functions.udf(convert_date, types.StringType())  # StringType comes from pyspark.sql.types
df_codes_grouped = df_codes_grouped.withColumn("date", udf_myFunction("date"))
df_codes_grouped = df_codes_grouped.withColumn("date", df_codes_grouped.date.cast("timestamp"))


w = (Window.orderBy(functions.col("date").cast("long")).rangeBetween(-days(7), 0))
df_codes_grouped = df_codes_grouped.withColumn("rolling_average", functions.avg("count").over(w))
df_codes_grouped.coalesce(1).write.csv("task4.csv")
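The log-parsing snippet above relies on parse_interaction and convert_date, which are not shown. A rough sketch, assuming the standard NASA access-log (Common Log Format) layout and the field names used above:

import re
from datetime import datetime
from pyspark.sql import Row

LOG_PATTERN = re.compile(r'^(\S+) \S+ \S+ \[([^\]]+)\] "(\S+) (\S+) \S+" (\d{3}) \S+')

def parse_interaction(line):
    # Turn one raw log line into a Row; unparsable lines become None and are dropped by the not-null filter above
    m = LOG_PATTERN.match(line)
    if m is None:
        return None
    host, timestamp, method, request, code = m.groups()
    return Row(host=host,
               date=timestamp.split(':')[0],   # e.g. '01/Jul/1995'
               request_method=method,
               request=request,
               return_code=code)

def convert_date(d):
    # '01/Jul/1995' -> '1995-07-01', so the later cast("timestamp") succeeds
    return datetime.strptime(d, '%d/%b/%Y').strftime('%Y-%m-%d')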
spark = (SparkSession.builder.appName("US departure flight data").config(
    "spark.driver.memory", "8g").getOrCreate())


# %%
def to_date_format_udf(d_str: str):
    # "02190925" -> "02/19 09:25" (month/day hour:minute)
    return d_str[0:2] + "/" + d_str[2:4] + " " + d_str[4:6] + ":" + d_str[6:]


to_date_format_udf("02190925")

# %%
spark.udf.register("to_date_format_udf", to_date_format_udf, T.StringType())

# %%
df = (spark.read.format("csv").schema(
    "date STRING, delay INT, distance INT, origin STRING, destination STRING").
      option("header", "true").option(
          "path", os.path.join(DATA_DIRECTORY, "flights",
                               "departuredelays.csv")).load())

# %%
df.show(5, False)

# %%
df.selectExpr("to_date_format_udf(date) as data_format").show(10,
                                                              truncate=False)
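Because to_date_format_udf is also registered with spark.udf.register, it can be called from SQL as well; a small sketch (the temp-view name is an assumption):

# %%
df.createOrReplaceTempView("us_delay_flights_tbl")
spark.sql("""
    SELECT to_date_format_udf(date) AS data_format
    FROM us_delay_flights_tbl
    LIMIT 10
""").show(truncate=False)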
Example #11
def add_zip5_col(dataframe, column_name="zipcode"):
    # StringType lives in pyspark.sql.types (assumed imported as T), not in pyspark.sql.functions
    get_zip5_udf = F.udf(get_zip5, T.StringType())
    return dataframe.withColumn("zip5", get_zip5_udf(dataframe[column_name]))
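add_zip5_col wraps a get_zip5 function that is not shown. A minimal sketch (the normalisation rules are assumptions):

def get_zip5(zipcode):
    # Keep only the first five digits of a ZIP or ZIP+4 code; None stays None
    if zipcode is None:
        return None
    digits = ''.join(ch for ch in str(zipcode) if ch.isdigit())
    return digits[:5] if digits else None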