def main():
    spark = SparkSession.builder.getOrCreate()
    frame = extract_demographics(spark, catalog)
    chinese_demographics = frame.filter(lower(col("country")) == "china")
    aggregated_demographics = aggregate_inhabitants_by_province(
        chinese_demographics)
    store_chinese_demographics(aggregated_demographics, catalog)
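# A minimal, self-contained sketch of the case-insensitive country filter used in
# main() above, with small hypothetical in-memory data standing in for
# extract_demographics()/catalog.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame(
    [("China", "Guangdong", 100), ("CHINA", "Beijing", 200), ("France", "Ile-de-France", 300)],
    ["country", "province", "inhabitants"])
demo.filter(lower(col("country")) == "china").show()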
Example #2
    def clean_data(self, list_words_col: str) -> pyspark.sql.DataFrame:
        """Pre-process the text data.
        Steps applied in this method:
        - Lower-case the text
        - Keep only letters and whitespace
        - Trim leading and trailing whitespace
        - Drop empty rows
        - Split each row into a list of words
        :Args:
            :param list_words_col: name of the DataFrame column to clean
        :Returns:
            :return: DataFrame with a word list in each row
        :Samples:
            +--------------------+
            |               value|
            +--------------------+
            |[over, in, one, n...|
        """
        self.__df_doc = self.__df_doc \
            .withColumn(list_words_col, lower(col(list_words_col))) \
            .withColumn(list_words_col, regexp_replace(col(list_words_col),
                                                       '[^a-z ]', '')) \
            .withColumn(list_words_col, trim(col(list_words_col))) \
            .filter(col(list_words_col) != "") \
            .withColumn(list_words_col, split(col(list_words_col), ' '))

        return self.__df_doc
 def _get_by_keywords(self, where_filter: str):
     prog = re.compile(r'"(.*)"')
     counts = []
     query = self.df
     for category in self.terms:
         if category != 'general':
             cat_terms = self.terms[category]
             if cat_terms:
                 for term in cat_terms:
                     match = prog.match(term)
                     if match:
                         print(match.groups())
                         like_expr = '%' + '%'.join(
                             match.groups()).lower() + '%'
                     else:
                         like_expr = '%' + term.lower().replace(
                             '%', '\%').replace(' ', '%').replace('"',
                                                                  '') + '%'
                     col_name = term.lower().replace('%', '\%').replace(
                         ' ', '_').replace('"', '')
                     query = query.withColumn(
                         col_name,
                         sql_funcs.lower(
                             sql_funcs.col('text')).like(like_expr))
                     counts.append(
                         sql_funcs.sum(
                             sql_funcs.when(sql_funcs.col(col_name),
                                            1)).alias(col_name))
     if where_filter == 'none':
         return json.loads(query.select(counts).toJSON().collect()[0])
     else:
         return json.loads(
             query.filter(where_filter).select(counts).toJSON().collect()
             [0])
Example #4
def get_tokenized_df(partitioned_df):
    #split into individual words
    tokenizer = Tokenizer(inputCol='body', outputCol='words_token')
    tokenized_but_unsplit_still_has_stop_words_and_punctuation_df = tokenizer\
    .transform(partitioned_df)\
    .select('topic','date_time','month','date', 'words_token')
    
    #remove stop words
    remover = StopWordsRemover(inputCol='words_token', 
                               outputCol='words_no_stops')
    tokenized_but_unsplit_still_has_punctuation_df = remover\
    .transform(tokenized_but_unsplit_still_has_stop_words_and_punctuation_df)\
    .select('topic','words_no_stops','date_time','month','date')
    
    #remove punctuation
    tokenized_but_unsplit_df = tokenized_but_unsplit_still_has_punctuation_df\
    .withColumn('words_and_punct', explode('words_no_stops'))\
    .select('topic','words_and_punct','date_time','month','date')
    
    df_split_has_nonletters_and_uppercase = tokenized_but_unsplit_df\
    .withColumn('word', explode(split(col('words_and_punct'), '[\W_]+')))\
    .select('topic','word','date_time','month','date')
    
    df_split_has_nonletters = df_split_has_nonletters_and_uppercase\
    .withColumn('word', lower(col('word')))
    
    tokenized_and_split_has_duplicates = df_split_has_nonletters\
    .filter(df_split_has_nonletters['word'].rlike('[a-zA-Z]'))
    
    #duplicates dropped to ignore cases of someone using a word in the same post
    tokenized_df = tokenized_and_split_has_duplicates.dropDuplicates()
    
    return tokenized_df
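# A hypothetical driver for get_tokenized_df above; assumes a SparkSession plus the
# Tokenizer, StopWordsRemover, explode, split, col and lower imports the snippet relies on.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
posts = spark.createDataFrame(
    [("python", "2020-01-01 10:00:00", "01", "2020-01-01",
      "The quick, brown fox jumps over the lazy dog!")],
    ["topic", "date_time", "month", "date", "body"])
get_tokenized_df(posts).show(truncate=False)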
def main(in_dir, out_dir):
    df = spark.read.text(in_dir, wholetext=True)
    df = df.select(f.explode(f.split('value', wordbreak)).alias('word'))
    df = df.select(f.lower('word').alias('word'))
    df = df.groupBy('word').agg(f.count('word').alias('count')).orderBy(
        ['count', 'word'], ascending=False).cache()
    df.write.csv(out_dir, mode='overwrite', compression='none')
Example #6
def tf_idf(df, n):
    # Extracting terms per each row/document as a list
    temp_df = df.withColumn(
        'terms',
        f.split(f.lower(f.regexp_replace(df.text_entry, '[^\\w\\s-]', '')),
                ' '))

    # Calculating total number of words per row/document
    temp_df1 = temp_df.withColumn('total_num_words', f.size('terms'))

    # Extracting words in each documents
    temp_df2 = temp_df1.withColumn('token', f.explode('terms'))

    # Calculating tf
    temp_df3 = temp_df2.groupBy('_id', 'token', 'total_num_words').agg({
        'token':
        'count'
    }).withColumnRenamed('count(token)', 'occurrence').sort('_id')
    temp_df4 = temp_df3.withColumn('tf', temp_df3.occurrence)

    # Calculating df
    temp_df5 = temp_df4.groupBy('token').agg(
        f.countDistinct('_id')).withColumnRenamed('count(DISTINCT _id)', 'df')

    # Calculating idf
    temp_df6 = temp_df5.withColumn('idf', f.log10(n / temp_df5.df))

    # Calculating tf-idf
    joined_df = temp_df4.join(temp_df6,
                              temp_df4.token == temp_df6.token).select(
                                  temp_df4.token, temp_df4._id, temp_df4.tf,
                                  temp_df6.df, temp_df6.idf)
    result = joined_df.withColumn('tf_idf', joined_df.tf * joined_df.idf)

    return result
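# A small, hypothetical driver for tf_idf above; assumes `f` is pyspark.sql.functions
# and that a SparkSession is available. `n` is the total number of documents.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
docs = spark.createDataFrame(
    [(1, "to be or not to be"), (2, "to thine own self be true")],
    ["_id", "text_entry"])
tf_idf(docs, n=docs.count()).show()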
def words_and_vecs_from_csv(spark, model, csv_filename):
    if spark:
        schema = StructType([
            StructField("word1", StringType(), False),
            StructField("word2", StringType(), False)
        ])
        df = spark.read.csv(csv_filename, header=False, schema=schema)
        df = df.select(
            lower(col("word1")).alias("word1"),
            lower(col("word2")).alias("word2"))

        rows = df.collect()
        words1 = [row.word1 for row in rows]
        words2 = [row.word2 for row in rows]

        as_array = udf(lambda s: [s], ArrayType(StringType(), False))
        df = df.withColumn("word1", as_array("word1")).withColumn(
            "word2", as_array("word2"))
        wordvecs1 = [
            row.model for row in model.transform(
                df.withColumnRenamed("word1", "sentence")).collect()
        ]
        wordvecs2 = [
            row.model for row in model.transform(
                df.withColumnRenamed("word2", "sentence")).collect()
        ]
    else:
        words1 = []
        words2 = []
        with open(csv_filename, "r") as csv_file:
            for line in csv_file:
                row = line.strip("\n").split(",")
                words1.append(row[0])
                words2.append(row[1])

        wordvecs1 = [
            model.wv[word1]
            if word1 in model.wv else zeros(model.wv.vector_size)
            for word1 in words1
        ]
        wordvecs2 = [
            model.wv[word2]
            if word2 in model.wv else zeros(model.wv.vector_size)
            for word2 in words2
        ]

    return words1, words2, wordvecs1, wordvecs2
Example #8
def prepare_df(path, const, words_to_index):
    rdd = sc.textFile(path)
    row = Row("review")
    df = rdd.map(row).toDF()
    # Clean text
    df_clean = df.select(F.lower(F.regexp_replace(F.col('review'), "n't", " n't")).alias('review'))
    df_clean = df_clean.select(F.lower(F.regexp_replace(F.col('review'), "[^0-9a-zA-Z\\s]", "")).alias('review'))
    # Tokenize text
    tokenizer = Tokenizer(inputCol='review', outputCol='words_token')
    df_words_token = tokenizer.transform(df_clean).select('words_token')
    df_cutted = df_words_token.withColumn('length', F.size(F.col('words_token')))
    # Replace each word with its index
    word_udf = F.udf(lambda row: [words_to_index[w] if w in words_to_index
                                  else words_to_index["unk"] for w in row],
                     ArrayType(IntegerType()))
    df_stemmed = df_cutted.withColumn('words_stemmed', word_udf(F.col('words_token')))
    return df_stemmed.withColumn("class", F.lit(const))
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces."""
    no_punct = regexp_replace(column, "\\p{Punct}", '')
    lowered = lower(no_punct)
    cleaned = trim(lowered)
    return cleaned
Example #10
def join():
    stream_df = spark.read.json('stream_cleanned')
    giant_df = spark.read.json('game_info_base')

    stream_df = stream_df.withColumn('gen_name',
                                     functions.lower(stream_df.game))
    stream_df = stream_df.select('stream_id', 'gen_name').distinct()
    game_info = stream_df.groupBy('gen_name').count().cache()

    real_game_info = game_info.join(
        giant_df, game_info.gen_name == functions.lower(giant_df.name), 'left')

    real_game_info = real_game_info.orderBy(functions.col('count').desc())

    print(real_game_info.where(functions.col('count') < 100).count())

    real_game_info.coalesce(1).write.json('real_game_info', mode='overwrite')
    def _transform(self, df):

        out_col = self.getOutputCol()
        in_col = self.getInputCol()

        df_lower = df.withColumn(out_col, f.lower(f.col(in_col)))

        return df_lower
Example #12
 def lower_column(self, column):
     try:
         self.spark_df = self.spark_df.withColumn('temp', f.lower(f.col(column))).drop(column) \
             .withColumnRenamed('temp', column)
         return self.get_json_df_response()
     except Exception as e:
         print(e)
         return None
Example #13
def lower_case_cols(data):
    data_dtypes = {col[0]: col[1] for col in data.dtypes}

    for column in data_dtypes.keys():
        if data_dtypes[column] == 'string':
            data = data.withColumn(column, F.lower(F.col(column)))

    return data
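# Hypothetical usage of lower_case_cols above; assumes `F` is pyspark.sql.functions.
# Only the string-typed columns are lower-cased; numeric columns are left alone.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
mixed = spark.createDataFrame([("Alice", "NYC", 30), ("BOB", "Paris", 25)],
                              ["name", "city", "age"])
lower_case_cols(mixed).show()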
Example #14
def preprocess_works(df, output_path):
    """
    Pre-process resume work-experience records.
    :param df: work-experience data, as a DataFrame
    :param output_path: output file path
    :return:
    """

    # Output columns after pre-processing
    output_columns = [
        "resume_id", "company_name", "industry", "position_name",
        "position_title", "position_category", "work_start_date",
        "work_end_date", "work_start_year", "work_end_year", "work_index",
        "salary_min", "salary_max"
    ]

    # Normalize company name, industry, and position name
    df = df.withColumn(
        "company_name",
        F.lower(F.trim(F.udf(normalization_company_name)(df.company_name))))
    df = df.withColumn(
        "industry",
        F.lower(F.trim(F.udf(normalization_industry)(df.industry_category))))
    df = df.withColumn(
        "position_name",
        F.lower(F.trim(F.udf(normalization_position_name)(df.position_title))))
    df = df.withColumn(
        "position_category",
        F.lower(F.trim(F.udf(get_position_level1)(df.position_name))))
    df = df.withColumn("start_time",
                       F.udf(lambda x: format_date.sub("-", x))(df.start_time))
    df = df.withColumn("end_time",
                       F.udf(lambda x: format_date.sub("-", x))(df.end_time))
    df = df.withColumn("work_start_date",
                       F.date_format(df.start_time, "yyy-MM"))
    df = df.withColumn("work_end_date", F.date_format(df.end_time, "yyy-MM"))
    df = df.withColumn("work_start_year", F.year(df.start_time))
    df = df.withColumn("work_end_year", F.year(df.end_time))
    df = df.withColumn("work_index", df.index)

    # Drop rows where company, industry, and position are all empty
    df = df.dropna(how="all",
                   subset=["company_name", "industry", "position_name"])

    # Save the pre-processed resume records to the given output path
    df.select(output_columns).write.mode('overwrite').save(output_path)
Example #15
def clean_text(c):
    c = lower(c)
    c = regexp_replace(c, "^rt ", "")
    c = regexp_replace(c, "(https?\\://)\\S+", "")
    c = regexp_replace(c, "[^a-zA-Z0-9\\s]", "")
    #c = split(c, "\\s+") tokenization...
    return c
def task_5(data_io, product_processed_data, word_0, word_1, word_2):
    # -----------------------------Column names--------------------------------
    # Inputs:
    title_column = 'title'
    PATTERN = ' '
    # Outputs:
    titleArray_column = 'titleArray'
    titleVector_column = 'titleVector'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    product_processed_data_output = product_processed_data.select(
        F.split(F.lower(F.col(title_column)), PATTERN).alias(titleArray_column))

    word2Vec = M.feature.Word2Vec(vectorSize=16,
                                  minCount=100,
                                  numPartitions=4,
                                  inputCol=titleArray_column,
                                  outputCol=titleVector_column,
                                  seed=5)

    model = word2Vec.fit(product_processed_data_output)

    #     result = model.transform(title_df)
    #     result
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'size_vocabulary': None,
        'word_0_synonyms': [
            (None, None),
        ],
        'word_1_synonyms': [
            (None, None),
        ],
        'word_2_synonyms': [
            (None, None),
        ]
    }
    # Modify res:
    res['count_total'] = product_processed_data_output.count()
    res['size_vocabulary'] = model.getVectors().count()
    for name, word in zip(
        ['word_0_synonyms', 'word_1_synonyms', 'word_2_synonyms'],
        [word_0, word_1, word_2]):
        res[name] = model.findSynonymsArray(word, 10)
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_5')
    return res
def transform_data(spark, input_file, cfg):
    '''
    transform data before writing to delta file
    '''

    staging_fact = spark.read.parquet(input_file)
    dim_user_columns = [
        'cicid', 'i94yr', 'i94mon', 'biryear', 'gender', 'i94cit', 'i94res'
    ]
    dim_user = staging_fact.select(*dim_user_columns)
    user_rename_column = {
        'cicid': 'cicid',
        'i94yr': 'year',
        'i94mon': 'month',
        'biryear': 'birth_year',
        'gender': 'gender',
        'i94cit': 'i94cit',
        'i94res': 'i94res'
    }

    dim_user = dim_user.toDF(
        *[user_rename_column[c] for c in dim_user.columns])
    for c in ['i94cit', 'i94res']:
        dim_user = dim_user.withColumn(
            c, F.substring(F.col(c).cast(T.StringType()), 1, 3))

    map_country_code = spark.read.csv(os.path.join(
        cfg['PATH']['DEV'], 'output/mapping_data/city_code.csv'),
                                      header=True)
    map_residence_code = spark.read.csv(os.path.join(
        cfg['PATH']['DEV'], 'output/mapping_data/residence_code.csv'),
                                        header=True)

    dim_user = dim_user.join(map_country_code, 'i94cit', 'left')
    dim_user = dim_user.join(map_residence_code, 'i94res', 'left')
    dim_user = dim_user.drop(*['i94res', 'i94cit'])

    dim_user = dim_user.withColumn('born_country',
                                   F.lower(F.col('born_country')))
    dim_user = dim_user.withColumn('residence_country',
                                   F.lower(F.col('residence_country')))
    dim_user = dim_user.withColumn('load_data_timestamp',
                                   F.lit(datetime.datetime.now()))

    return dim_user
def customer_email_match():
    ccDF = spark.sql("""
			select dl_filename,dl_line_no, dl_file_prefix,dl_file_timestamp, lower(trim(email_id)) as email_id
			from dl_business.contact_bkp_18062018 where (email_id is not null AND trim(email_id) <> '' AND trim(lower(email_id)) <> 'null')
			group by dl_filename,dl_line_no,dl_file_timestamp,dl_file_prefix,email_id
			""")
    ccDF = ccDF.dropDuplicates(
        ['dl_file_prefix', 'dl_file_timestamp', 'dl_line_no', 'email_id'])
    emailexclDF = spark.sql("""
			select lower(trim(email_address)) as email_address from ods_planninginc_data.scv_email_exclusions
			""")
    ccDF = ccDF.join(
        broadcast(emailexclDF), ccDF.email_id == emailexclDF.email_address,
        "left_outer").where(emailexclDF.email_address.isNull()).select(
            [F.col(c) for c in ccDF.columns])
    ccDF = ccDF.withColumn(
        "email_str",
        F.trim(F.lower(F.split(ccDF['email_id'], "@")[0]))).withColumn(
            "email_domain", F.trim(F.lower(F.split(ccDF['email_id'], "@")[1])))
    ccDFFilt = ccDF.filter(
        F.col("email_domain").isNotNull()
        | ~(str_isnumeric_udf(F.col("email_str"))))
    ccDFFilt = ccDFFilt.na.drop(subset=["email_str", "email_domain"])
    ccDFFilt = ccDFFilt.withColumn(
        "std_email_str", email_clean_udf(F.col("email_str"))).withColumn(
            "std_email_domain", email_clean_udf(F.col("email_domain"))).select(
                'dl_filename', 'dl_line_no', 'dl_file_prefix',
                'dl_file_timestamp', 'email_id', 'std_email_str',
                'std_email_domain')
    ccDFFilt.createOrReplaceTempView("contact_clean_data")
    ccDFGrp = spark.sql("""
				select std_email_str,std_email_domain 
				from contact_clean_data 
				group by std_email_str,std_email_domain
				""")
    ccDFGrp.createOrReplaceTempView("contact_uniq_date")
    emlmatchDF = spark.sql("""
				select a.* 
				from contact_clean_data a,contact_uniq_date b 
				where a.std_email_str = b.std_email_str and a.std_email_domain = b.std_email_domain
				""")
    #emlmatchDF.persist()
    #emlmatchDF.createOrReplaceTempView("contact_match_data")
    print("Email Match Step is done")
    return emlmatchDF
Example #19
def _clean_string_column(df: DataFrame, column_name: str) -> DataFrame:
    """
    Standardize the characters in a StringType() column. Lower case them, then replace spaces with '_'
    :param df: The dataframe to perform the operation on.
    :param column_name: The column to perform the operation on.
    :return: The same dataframe with the formatted column.
    """
    return df.withColumn(column_name,
                         F.lower(F.regexp_replace(column_name, ' ', '_')))
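# Hypothetical usage of _clean_string_column above; assumes `F` is
# pyspark.sql.functions and DataFrame is pyspark.sql.DataFrame.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame([("North America",), ("South East Asia",)], ["region"])
_clean_string_column(raw, "region").show()  # -> north_america, south_east_asia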
Example #20
def filter_tweets(txt_lst, df, col):
    '''
    returns entries containing a word in txt_lst
    '''
    res = spark.createDataFrame(sc.emptyRDD(), df.schema)
    for keyword in txt_lst:
            filt_tweets = df.filter(f.lower(col).contains(keyword.lower()))
            res = res.union(filt_tweets).distinct()
    return res
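# Hypothetical usage of filter_tweets above; the function relies on module-level
# `spark`, `sc` and `pyspark.sql.functions as f`, recreated here for a standalone run.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
tweets = spark.createDataFrame([("I love PySpark",), ("Hello world",)], ["text"])
filter_tweets(["PySpark"], tweets, f.col("text")).show()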
Example #21
def get_subreddit_topics_df(subreddit_topics_csv):
    # convert subreddit topics csv to df, this will be used to get topics column
    subreddit_topics = spark.read\
    .csv(subreddit_topics_csv, header='true', inferSchema='true')

    subreddit_topics = subreddit_topics\
    .withColumn('subreddit', lower(col('subreddit')))

    return subreddit_topics
Example #22
 def execute(self):
     """
     Method classifies the recipes with 'beef' as an ingredient for difficulty. Difficulty is
     classified based on the total time required for prepping and cooking the food. Method
     extends on the super class TransformRecipes classification logic by applying a filter for
     the keyword 'beef' in ingredients column.
     :return:
     """
     return super(BeefRecipes, self).execute().where(lower(col('ingredients')).like("%beef%"))
def create_sentiment(news_df):

    news_df['date'] = pd.to_datetime(news_df['date'])
    news_df = news_df[['date', 'symbol', 'combined_data', "sector_name"]]
    news_df = spark.createDataFrame(news_df)

    news_text_lower = news_df.select(
        news_df['date'], news_df['symbol'],
        functions.lower(news_df['combined_data']).alias("content"),
        news_df["sector_name"])

    #text preprocessing
    df = news_text_lower.withColumn(
        'text_non_asci', strip_non_ascii_udf(news_text_lower['content']))
    rm_df = df.withColumn('clean_text',
                          remove_features_udf(df['text_non_asci']))

    regexTokenizer = RegexTokenizer(gaps=False,
                                    pattern='\w+',
                                    inputCol='clean_text',
                                    outputCol='token')
    stopWordsRemover = StopWordsRemover(inputCol='token',
                                        outputCol='no_stopword')

    my_pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover])
    reg_model = my_pipeline.fit(rm_df)
    reg_df = reg_model.transform(rm_df)

    reg_joined = reg_df.withColumn("fine_text",
                                   join_udf(reg_df['no_stopword']))
    reg_joined1 = reg_joined.withColumn(
        'eng_text', strip_nonenglish_udf(reg_joined['fine_text']))
    clean_df = reg_joined1.select('date', 'symbol', 'eng_text', 'sector_name')
    #clean_df.show(2)

    #calculate sentiment score

    news_rdd = clean_df.rdd
    lemma_rdd = news_rdd.map(lemma)
    score_rdd = lemma_rdd.map(sentiment_analyzer_scores)
    news_sentiment = spark.createDataFrame(score_rdd).cache()

    news_sentiment = news_sentiment.withColumnRenamed('_1', 'date')
    news_sentiment = news_sentiment.withColumnRenamed('_2', 'symbol')
    news_sentiment = news_sentiment.withColumnRenamed('_3', 'sector_name')
    news_sentiment = news_sentiment.withColumnRenamed('_4', 'sentiment')
    news_sentiment = news_sentiment.withColumn(
        'senti_label', senti_udf(news_sentiment['sentiment']))
    stock_news = news_sentiment.toPandas()

    stock_news['date'] = stock_news['date'].apply(
        lambda x: pd.to_datetime(x).tz_convert('US/Eastern'))
    stock_news['date'] = stock_news['date'].dt.tz_localize(None)
    stock_news['date'] = stock_news['date'].dt.date
    stock_news['date'] = pd.to_datetime(stock_news['date'])
    return (stock_news)
Example #24
def task_5(data_io, product_processed_data, word_0, word_1, word_2):
    # -----------------------------Column names--------------------------------
    # Inputs:
    title_column = 'title'
    # Outputs:
    titleArray_column = 'titleArray'
    titleVector_column = 'titleVector'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    step = product_processed_data.select(
        title_column,
        F.lower(F.col(title_column)).alias('temp'))

    split = step.withColumn(titleArray_column,
                            F.split(F.col('temp'), ' ').cast("array<string>"))

    word2vec = M.feature.Word2Vec(minCount=100,
                                  vectorSize=16,
                                  seed=SEED,
                                  numPartitions=4,
                                  inputCol=titleArray_column,
                                  outputCol=titleVector_column)
    model = word2vec.fit(split)

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'size_vocabulary': None,
        'word_0_synonyms': [
            (None, None),
        ],
        'word_1_synonyms': [
            (None, None),
        ],
        'word_2_synonyms': [
            (None, None),
        ]
    }
    # Modify res:

    res['count_total'] = split.count()
    res['size_vocabulary'] = model.getVectors().count()
    for name, word in zip(
        ['word_0_synonyms', 'word_1_synonyms', 'word_2_synonyms'],
        [word_0, word_1, word_2]):
        res[name] = model.findSynonymsArray(word, 10)

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_5')
    return res
Example #25
	def process(self, cardo_context, cardo_dataframe):
		# type: (CardoContextBase, CardoDataFrame) -> CardoDataFrame
		new_columns = []
		for column in cardo_dataframe.dataframe.schema:
			relevant_formatters = filter(lambda formatter: formatter.is_type(column.dataType), self.__formatters)
			new_column = relevant_formatters[0].convert(cardo_dataframe.dataframe[column.name])
			new_column = F.lower(new_column).alias(column.name)
			new_columns.append(new_column)
		cardo_dataframe.dataframe = cardo_dataframe.dataframe.select(new_columns)
		return cardo_dataframe
Example #26
 def is_area_of_study(col):                   #function to check area of study
     @F.udf
     def f1(v):
         l = ['engineering', 'teaching', 'communications', 'animal ccience', 'science & math', 'law & government', 'architecture', 'business', 'culinary arts', 'performing arts', 'health profession', 'visual art & design', 'film/video', 'cosmetology', 'humanities & interdisciplinary', 'computer science & technology', 'project-based learning', 'hospitality, travel, & tourism', 'performing arts/visual art & design', 'environmental science', 'zoned']
         if v in l:
             return True
         else:
             return False
     x = col.cast('string')
     return F.count(F.when(f1(F.lower(x)) == True, x))
def getTrumpTwitt(df):
    parseDate = f.udf(lambda x: parser.parse(x), DateType())
    return df.select(
        parseDate(df.created_at).alias('created_at'),
        f.lower(df.text).alias('text_lower'), 'user.id_str',
        'user.screen_name', 'user.followers_count', df.retweet_count, df.id,
        f.col('entities.user_mentions.screen_name').alias('mentioned'),
        f.col('entities.hashtags.text').alias(
            'hashtags')).where((f.col('text_lower').like('%trump%'))
                               & (f.col('place.country_code') == 'US'))
def transform_countries(df: SparkDataFrame) -> SparkDataFrame:
    """Transform countries data
    Add a column of lower case country name for joining purpose

    :param df: the countries data frame to be transformed
    :return: the transformed countries frame
    """

    df = df.withColumn('lower_name', lower(df['name']))
    return df
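# Hypothetical usage of transform_countries above; assumes `lower` comes from
# pyspark.sql.functions and SparkDataFrame aliases pyspark.sql.DataFrame.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
countries = spark.createDataFrame([("Germany",), ("New Zealand",)], ["name"])
transform_countries(countries).show()  # adds lower_name for case-insensitive joins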
Example #29
def lower_array(df, array_col):
    """
    this function will take an array<string> column in the passed dataframe
    and lowercase each element in the array
    :param df: input dataframe
    :param array_col: name of the array<string> column
    :return: dataframe with array<string> column with lowercased elements
    """
    return df.withColumn(array_col,
                         split(lower(concat_ws(",", col(array_col))), ","))
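# Hypothetical usage of lower_array above; assumes split, lower, concat_ws and col
# are imported from pyspark.sql.functions. The concat_ws/split round-trip assumes
# the array elements themselves contain no commas.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
tags = spark.createDataFrame([(["Spark", "SQL"],), (["Python"],)], ["tags"])
lower_array(tags, "tags").show(truncate=False)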
Example #30
def bucket_engine(df):
    eng = F.lower(F.col("engine"))
    return df.withColumn(
        "engine",
        F.when(eng.like("google%"), "google")
        .when(eng.like("ddg%"), "duckduckgo")
        .when(eng.like("duckduckgo%"), "duckduckgo")
        .when(eng.like("bing%"), "bing")
        .otherwise("other"),
    )
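# Hypothetical usage of bucket_engine above; assumes `F` is pyspark.sql.functions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
searches = spark.createDataFrame(
    [("Google Search",), ("DDG Lite",), ("Bing Mobile",), ("Yandex",)], ["engine"])
bucket_engine(searches).show()  # -> google, duckduckgo, bing, other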
Example #31
 def preprocess(self, df: DataFrame):
     preprocess_df = df.filter(~F.isnull("se_property")) \
         .withColumn("se_label", F.lower(F.col("se_label"))) \
         .filter(~F.isnull("se_label")) \
         .withColumn("se_property_type", self.classify("event_sub_type", "se_label")) \
         .withColumn("isSuspect", F.col("isSuspect").cast("int")) \
         .filter(F.col("author_id").isNotNull() | F.col("discovery_id").isNotNull()) \
         .drop_duplicates(["event_id", "user_token", "device_id", "user_ipaddress", "isSuspect"]) \
         .withColumn("hour", F.hour("collector_tstamp"))
     return preprocess_df
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.
    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.
    Args:
        column (Column): A Column containing a sentence.
    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    return (trim(regexp_replace(lower(column),'[^a-zA-Z0-9 ]','')).alias('sentence'))
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    #column_val = regexp_replace(column, "\p{Punct}", "")
    #return trim(lower(column_val))
    word = lower(trim(regexp_replace(regexp_replace(column, '[^\w\s]', ''),'_',''))).alias("word")
    return word
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    
#     assert(isinstance(column, pyspark.sql.column.Column))
    assert(str(type(column)) == "<class 'pyspark.sql.column.Column'>")    
    
    columnNoPunct = regexp_replace(column, "[^a-zA-Z0-9 ]", "")
#     columnNoPunct = regexp_replace(column, string.punctuation, "")    
    columnLowerCase = lower(columnNoPunct)
    columnTrimmed = trim(columnLowerCase)
    
    return columnTrimmed
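# Hypothetical usage of the removePunctuation variants above; assumes lower,
# regexp_replace and trim are imported from pyspark.sql.functions.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
sentences = spark.createDataFrame([("  Hi, you!",), ("It's ME...  ",)], ["sentence"])
sentences.select(removePunctuation(col("sentence"))).show()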
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)


# COMMAND ----------

from pyspark.sql.functions import initcap
df.select(initcap(col("Description"))).show()


# COMMAND ----------

from pyspark.sql.functions import lower, upper
df.select(col("Description"),
    lower(col("Description")),
    upper(lower(col("Description")))).show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
    rtrim(lit("    HELLO    ")).alias("rtrim"),
    trim(lit("    HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)


# COMMAND ----------
Example #36
# COMMAND ----------

import pyspark.sql.functions as func
from pyspark.sql.functions import col

filtered = dfSmall.filter((col("title") != "<PARSE ERROR>") & col("redirect_title").isNull() & col("text").isNotNull())

# COMMAND ----------

# MAGIC %md
# MAGIC Change all text to lower case.

# COMMAND ----------

lowered = filtered.select("*", func.lower(col("text")).alias("lowerText"))

# COMMAND ----------

parsed = lowered.drop("text").withColumnRenamed("lowerText", "text")

# COMMAND ----------

# MAGIC %md
# MAGIC Split the Wikipedia text into sentences.

# COMMAND ----------

pattern = r"(\. |\n{2,})"
import re
Example #37
# COMMAND ----------

import pyspark.sql.functions as func
from pyspark.sql.functions import col
filtered = dfSmall.filter((col('title') != '<PARSE ERROR>') &
                           col('redirect_title').isNull() &
                           col('text').isNotNull())

# COMMAND ----------

# MAGIC %md
# MAGIC Change all text to lower case.

# COMMAND ----------

lowered = filtered.select('*', func.lower(col('text')).alias('lowerText'))

# COMMAND ----------

parsed = (lowered
          .drop('text')
          .withColumnRenamed('lowerText', 'text'))

# COMMAND ----------

# MAGIC %md
# MAGIC Split the Wikipedia text into sentences.

# COMMAND ----------

pattern = r'(\. |\n{2,})'
Example #38
from lib.pos_tags import PosTags
from lib.chunks import Chunks

t = Tokens()
p = PosTags()
c = Chunks()

c.train(c.load_training_data("../data/chunker_training_50_fixed.json"))


def pipeline(s):
    """
    Given a string, return a list of relations
    """
    return c.assemble(c.tag(p.tag(t.tokenize(s))))


pipeline_udf = sql.udf(pipeline, types.ArrayType(types.MapType(types.StringType(), types.StringType())))


phrases = (
    notes.withColumn("phrases", pipeline_udf(notes["document"]))
    .select(sql.explode(sql.col("phrases")).alias("text"))
    .filter(sql.col("text")["tag"] == "NP")
    .select(sql.lower(sql.col("text")["phrase"]).alias("phrase"))
    .groupBy(sql.col("phrase"))
    .count()
)

phrases.write.parquet("../data/idigbio_phrases.parquet")
Example #39
withCEST = withDate.withColumn('cest_time', func.from_utc_timestamp('timestamp', 'Europe/Amsterdam'))
withCEST.printSchema()

(withCEST
 .select('timestamp', 'cest_time')
 .show(3, False))

# COMMAND ----------

# MAGIC %md
# MAGIC Next, let's convert the text field to lowercase.  We'll use the `lower` function for this.

# COMMAND ----------

lowered = withCEST.select('*', func.lower(col('text')).alias('lowerText'))

print(lowered.select('lowerText').first())

# COMMAND ----------

# MAGIC %md
# MAGIC What columns do we have now?

# COMMAND ----------

print(lowered.columns)

# COMMAND ----------

# MAGIC %md