def main():
    spark = SparkSession.builder.getOrCreate()
    frame = extract_demographics(spark, catalog)
    chinese_demographics = frame.filter(lower(col("country")) == "china")
    aggregated_demographics = aggregate_inhabitants_by_province(
        chinese_demographics)
    store_chinese_demographics(aggregated_demographics, catalog)
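# A minimal, self-contained sketch of the case-insensitive country filter used in
# main() above, with small hypothetical in-memory data standing in for
# extract_demographics()/catalog.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, lower

spark = SparkSession.builder.getOrCreate()
demo = spark.createDataFrame(
    [("China", "Guangdong", 100), ("CHINA", "Beijing", 200), ("France", "Ile-de-France", 300)],
    ["country", "province", "inhabitants"])
demo.filter(lower(col("country")) == "china").show()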
Example #2
    def clean_data(self, list_words_col: str) -> pyspark.sql.DataFrame:
        """Pre-process the text data.
        Steps applied in this method:
        - Lower-case the text
        - Keep only letters and whitespace
        - Trim leading and trailing whitespace
        - Drop empty rows
        - Split each row into a list of words
        :Args:
            :param list_words_col: name of the DataFrame column to clean
        :Returns:
            :return: DataFrame with a word list in each row
        :Samples:
            +--------------------+
            |               value|
            +--------------------+
            |[over, in, one, n...|
        """
        self.__df_doc = self.__df_doc \
            .withColumn(list_words_col, lower(col(list_words_col))) \
            .withColumn(list_words_col, regexp_replace(col(list_words_col),
                                                       '[^a-z ]', '')) \
            .withColumn(list_words_col, trim(col(list_words_col))) \
            .filter(col(list_words_col) != "") \
            .withColumn(list_words_col, split(col(list_words_col), ' '))

        return self.__df_doc
 def _get_by_keywords(self, where_filter: str):
     prog = re.compile(r'"(.*)"')
     counts = []
     query = self.df
     for category in self.terms:
         if category != 'general':
             cat_terms = self.terms[category]
             if cat_terms:
                 for term in cat_terms:
                     match = prog.match(term)
                     if match:
                         print(match.groups())
                         like_expr = '%' + '%'.join(
                             match.groups()).lower() + '%'
                     else:
                         like_expr = '%' + term.lower().replace(
                             '%', '\%').replace(' ', '%').replace('"',
                                                                  '') + '%'
                     col_name = term.lower().replace('%', '\%').replace(
                         ' ', '_').replace('"', '')
                     query = query.withColumn(
                         col_name,
                         sql_funcs.lower(
                             sql_funcs.col('text')).like(like_expr))
                     counts.append(
                         sql_funcs.sum(
                             sql_funcs.when(sql_funcs.col(col_name),
                                            1)).alias(col_name))
     if where_filter == 'none':
         return json.loads(query.select(counts).toJSON().collect()[0])
     else:
         return json.loads(
             query.filter(where_filter).select(counts).toJSON().collect()
             [0])
Example #4
def get_tokenized_df(partitioned_df):
    #split into individual words
    tokenizer = Tokenizer(inputCol='body', outputCol='words_token')
    tokenized_but_unsplit_still_has_stop_words_and_punctuation_df = tokenizer\
    .transform(partitioned_df)\
    .select('topic','date_time','month','date', 'words_token')
    
    #remove stop words
    remover = StopWordsRemover(inputCol='words_token', 
                               outputCol='words_no_stops')
    tokenized_but_unsplit_still_has_punctuation_df = remover\
    .transform(tokenized_but_unsplit_still_has_stop_words_and_punctuation_df)\
    .select('topic','words_no_stops','date_time','month','date')
    
    #remove punctuation
    tokenized_but_unsplit_df = tokenized_but_unsplit_still_has_punctuation_df\
    .withColumn('words_and_punct', explode('words_no_stops'))\
    .select('topic','words_and_punct','date_time','month','date')
    
    df_split_has_nonletters_and_uppercase = tokenized_but_unsplit_df\
    .withColumn('word', explode(split(col('words_and_punct'), '[\W_]+')))\
    .select('topic','word','date_time','month','date')
    
    df_split_has_nonletters = df_split_has_nonletters_and_uppercase\
    .withColumn('word', lower(col('word')))
    
    tokenized_and_split_has_duplicates = df_split_has_nonletters\
    .filter(df_split_has_nonletters['word'].rlike('[a-zA-Z]'))
    
    #duplicates dropped to ignore cases of someone using a word in the same post
    tokenized_df = tokenized_and_split_has_duplicates.dropDuplicates()
    
    return tokenized_df
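# A hypothetical driver for get_tokenized_df above; assumes a SparkSession plus the
# Tokenizer, StopWordsRemover, explode, split, col and lower imports the snippet relies on.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
posts = spark.createDataFrame(
    [("python", "2020-01-01 10:00:00", "01", "2020-01-01",
      "The quick, brown fox jumps over the lazy dog!")],
    ["topic", "date_time", "month", "date", "body"])
get_tokenized_df(posts).show(truncate=False)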
def main(in_dir, out_dir):
    df = spark.read.text(in_dir, wholetext=True)
    df = df.select(f.explode(f.split('value', wordbreak)).alias('word'))
    df = df.select(f.lower('word').alias('word'))
    df = df.groupBy('word').agg(f.count('word').alias('count')).orderBy(
        ['count', 'word'], ascending=False).cache()
    df.write.csv(out_dir, mode='overwrite', compression='none')
Example #6
def tf_idf(df, n):
    # Extracting terms per each row/document as a list
    temp_df = df.withColumn(
        'terms',
        f.split(f.lower(f.regexp_replace(df.text_entry, '[^\\w\\s-]', '')),
                ' '))

    # Calculating total number of words per row/document
    temp_df1 = temp_df.withColumn('total_num_words', f.size('terms'))

    # Extracting words in each documents
    temp_df2 = temp_df1.withColumn('token', f.explode('terms'))

    # Calculating tf
    temp_df3 = temp_df2.groupBy('_id', 'token', 'total_num_words').agg({
        'token':
        'count'
    }).withColumnRenamed('count(token)', 'occurrence').sort('_id')
    temp_df4 = temp_df3.withColumn('tf', temp_df3.occurrence)

    # Calculating df
    temp_df5 = temp_df4.groupBy('token').agg(
        f.countDistinct('_id')).withColumnRenamed('count(DISTINCT _id)', 'df')

    # Calculating idf
    temp_df6 = temp_df5.withColumn('idf', f.log10(n / temp_df5.df))

    # Calculating tf-idf
    joined_df = temp_df4.join(temp_df6,
                              temp_df4.token == temp_df6.token).select(
                                  temp_df4.token, temp_df4._id, temp_df4.tf,
                                  temp_df6.df, temp_df6.idf)
    result = joined_df.withColumn('tf_idf', joined_df.tf * joined_df.idf)

    return result
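# A small, hypothetical driver for tf_idf above; assumes `f` is pyspark.sql.functions
# and that a SparkSession is available. `n` is the total number of documents.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
docs = spark.createDataFrame(
    [(1, "to be or not to be"), (2, "to thine own self be true")],
    ["_id", "text_entry"])
tf_idf(docs, n=docs.count()).show()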
def words_and_vecs_from_csv(spark, model, csv_filename):
    if spark:
        schema = StructType([
            StructField("word1", StringType(), False),
            StructField("word2", StringType(), False)
        ])
        df = spark.read.csv(csv_filename, header=False, schema=schema)
        df = df.select(
            lower(col("word1")).alias("word1"),
            lower(col("word2")).alias("word2"))

        rows = df.collect()
        words1 = [row.word1 for row in rows]
        words2 = [row.word2 for row in rows]

        as_array = udf(lambda s: [s], ArrayType(StringType(), False))
        df = df.withColumn("word1", as_array("word1")).withColumn(
            "word2", as_array("word2"))
        wordvecs1 = [
            row.model for row in model.transform(
                df.withColumnRenamed("word1", "sentence")).collect()
        ]
        wordvecs2 = [
            row.model for row in model.transform(
                df.withColumnRenamed("word2", "sentence")).collect()
        ]
    else:
        words1 = []
        words2 = []
        with open(csv_filename, "r") as csv_file:
            for line in csv_file:
                row = line.strip("\n").split(",")
                words1.append(row[0])
                words2.append(row[1])

        wordvecs1 = [
            model.wv[word1]
            if word1 in model.wv else zeros(model.wv.vector_size)
            for word1 in words1
        ]
        wordvecs2 = [
            model.wv[word2]
            if word2 in model.wv else zeros(model.wv.vector_size)
            for word2 in words2
        ]

    return words1, words2, wordvecs1, wordvecs2
Example #8
def prepare_df(path, const, words_to_index):
    rdd = sc.textFile(path)
    row = Row("review")
    df = rdd.map(row).toDF()
    # Clean text
    df_clean = df.select(F.lower(F.regexp_replace(F.col('review'), "n't", " n't")).alias('review'))
    df_clean = df_clean.select(F.lower(F.regexp_replace(F.col('review'), "[^0-9a-zA-Z\\s]", "")).alias('review'))
    # Tokenize text
    tokenizer = Tokenizer(inputCol='review', outputCol='words_token')
    df_words_token = tokenizer.transform(df_clean).select('words_token')
    df_cutted = df_words_token.withColumn('length', F.size(F.col('words_token')))
    # Replace each word with its index
    word_udf = F.udf(lambda row: [words_to_index[w] if w in words_to_index
                                  else words_to_index["unk"] for w in row],
                     ArrayType(IntegerType()))
    df_stemmed = df_cutted.withColumn('words_stemmed', word_udf(F.col('words_token')))
    return df_stemmed.withColumn("class", F.lit(const))
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces."""
    no_punct = regexp_replace(column, "\\p{Punct}", '')
    lowered = lower(no_punct)
    cleaned = trim(lowered)
    return cleaned
Example #10
def join():
    stream_df = spark.read.json('stream_cleanned')
    giant_df = spark.read.json('game_info_base')

    stream_df = stream_df.withColumn('gen_name',
                                     functions.lower(stream_df.game))
    stream_df = stream_df.select('stream_id', 'gen_name').distinct()
    game_info = stream_df.groupBy('gen_name').count().cache()

    real_game_info = game_info.join(
        giant_df, game_info.gen_name == functions.lower(giant_df.name), 'left')

    real_game_info = real_game_info.orderBy(functions.col('count').desc())

    print(real_game_info.where(functions.col('count') < 100).count())

    real_game_info.coalesce(1).write.json('real_game_info', mode='overwrite')
    def _transform(self, df):

        out_col = self.getOutputCol()
        in_col = self.getInputCol()

        df_lower = df.withColumn(out_col, f.lower(f.col(in_col)))

        return df_lower
Example #12
 def lower_column(self, column):
     try:
         self.spark_df = self.spark_df.withColumn('temp', f.lower(f.col(column))).drop(column) \
             .withColumnRenamed('temp', column)
         return self.get_json_df_response()
     except Exception as e:
         print(e)
         return None
Example #13
def lower_case_cols(data):
    data_dtypes = {col[0]: col[1] for col in data.dtypes}

    for column in data_dtypes.keys():
        if data_dtypes[column] == 'string':
            data = data.withColumn(column, F.lower(F.col(column)))

    return data
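# Hypothetical usage of lower_case_cols above; assumes `F` is pyspark.sql.functions.
# Only the string-typed columns are lower-cased; numeric columns are left alone.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
mixed = spark.createDataFrame([("Alice", "NYC", 30), ("BOB", "Paris", 25)],
                              ["name", "city", "age"])
lower_case_cols(mixed).show()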
Example #14
def preprocess_works(df, output_path):
    """
    Pre-process resume work-experience records.
    :param df: work-experience data, as a DataFrame
    :param output_path: output file path
    :return:
    """

    # Output columns after pre-processing
    output_columns = [
        "resume_id", "company_name", "industry", "position_name",
        "position_title", "position_category", "work_start_date",
        "work_end_date", "work_start_year", "work_end_year", "work_index",
        "salary_min", "salary_max"
    ]

    # Normalize company name, industry, and position name
    df = df.withColumn(
        "company_name",
        F.lower(F.trim(F.udf(normalization_company_name)(df.company_name))))
    df = df.withColumn(
        "industry",
        F.lower(F.trim(F.udf(normalization_industry)(df.industry_category))))
    df = df.withColumn(
        "position_name",
        F.lower(F.trim(F.udf(normalization_position_name)(df.position_title))))
    df = df.withColumn(
        "position_category",
        F.lower(F.trim(F.udf(get_position_level1)(df.position_name))))
    df = df.withColumn("start_time",
                       F.udf(lambda x: format_date.sub("-", x))(df.start_time))
    df = df.withColumn("end_time",
                       F.udf(lambda x: format_date.sub("-", x))(df.end_time))
    df = df.withColumn("work_start_date",
                       F.date_format(df.start_time, "yyy-MM"))
    df = df.withColumn("work_end_date", F.date_format(df.end_time, "yyy-MM"))
    df = df.withColumn("work_start_year", F.year(df.start_time))
    df = df.withColumn("work_end_year", F.year(df.end_time))
    df = df.withColumn("work_index", df.index)

    # Drop rows where company, industry, and position are all empty
    df = df.dropna(how="all",
                   subset=["company_name", "industry", "position_name"])

    # Save the pre-processed resume records to the given output path
    df.select(output_columns).write.mode('overwrite').save(output_path)
Example #15
def clean_text(c):
    c = lower(c)
    c = regexp_replace(c, "^rt ", "")
    c = regexp_replace(c, "(https?\\://)\\S+", "")
    c = regexp_replace(c, "[^a-zA-Z0-9\\s]", "")
    #c = split(c, "\\s+") tokenization...
    return c
def task_5(data_io, product_processed_data, word_0, word_1, word_2):
    # -----------------------------Column names--------------------------------
    # Inputs:
    title_column = 'title'
    PATTERN = ' '
    # Outputs:
    titleArray_column = 'titleArray'
    titleVector_column = 'titleVector'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------
    product_processed_data_output = product_processed_data.select(
        F.split(F.lower(F.col(title_column)), PATTERN).alias(titleArray_column))

    word2Vec = M.feature.Word2Vec(vectorSize=16,
                                  minCount=100,
                                  numPartitions=4,
                                  inputCol=titleArray_column,
                                  outputCol=titleVector_column,
                                  seed=5)

    model = word2Vec.fit(product_processed_data_output)

    #     result = model.transform(title_df)
    #     result
    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'size_vocabulary': None,
        'word_0_synonyms': [
            (None, None),
        ],
        'word_1_synonyms': [
            (None, None),
        ],
        'word_2_synonyms': [
            (None, None),
        ]
    }
    # Modify res:
    res['count_total'] = product_processed_data_output.count()
    res['size_vocabulary'] = model.getVectors().count()
    for name, word in zip(
        ['word_0_synonyms', 'word_1_synonyms', 'word_2_synonyms'],
        [word_0, word_1, word_2]):
        res[name] = model.findSynonymsArray(word, 10)
    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_5')
    return res
def transform_data(spark, input_file, cfg):
    '''
    transform data before writing to delta file
    '''

    staging_fact = spark.read.parquet(input_file)
    dim_user_columns = [
        'cicid', 'i94yr', 'i94mon', 'biryear', 'gender', 'i94cit', 'i94res'
    ]
    dim_user = staging_fact.select(*dim_user_columns)
    user_rename_column = {
        'cicid': 'cicid',
        'i94yr': 'year',
        'i94mon': 'month',
        'biryear': 'birth_year',
        'gender': 'gender',
        'i94cit': 'i94cit',
        'i94res': 'i94res'
    }

    dim_user = dim_user.toDF(
        *[user_rename_column[c] for c in dim_user.columns])
    for c in ['i94cit', 'i94res']:
        dim_user = dim_user.withColumn(
            c, F.substring(F.col(c).cast(T.StringType()), 1, 3))

    map_country_code = spark.read.csv(os.path.join(
        cfg['PATH']['DEV'], 'output/mapping_data/city_code.csv'),
                                      header=True)
    map_residence_code = spark.read.csv(os.path.join(
        cfg['PATH']['DEV'], 'output/mapping_data/residence_code.csv'),
                                        header=True)

    dim_user = dim_user.join(map_country_code, 'i94cit', 'left')
    dim_user = dim_user.join(map_residence_code, 'i94res', 'left')
    dim_user = dim_user.drop(*['i94res', 'i94cit'])

    dim_user = dim_user.withColumn('born_country',
                                   F.lower(F.col('born_country')))
    dim_user = dim_user.withColumn('residence_country',
                                   F.lower(F.col('residence_country')))
    dim_user = dim_user.withColumn('load_data_timestamp',
                                   F.lit(datetime.datetime.now()))

    return dim_user
def customer_email_match():
    ccDF = spark.sql("""
			select dl_filename,dl_line_no, dl_file_prefix,dl_file_timestamp, lower(trim(email_id)) as email_id
			from dl_business.contact_bkp_18062018 where (email_id is not null AND trim(email_id) <> '' AND trim(lower(email_id)) <> 'null')
			group by dl_filename,dl_line_no,dl_file_timestamp,dl_file_prefix,email_id
			""")
    ccDF = ccDF.dropDuplicates(
        ['dl_file_prefix', 'dl_file_timestamp', 'dl_line_no', 'email_id'])
    emailexclDF = spark.sql("""
			select lower(trim(email_address)) as email_address from ods_planninginc_data.scv_email_exclusions
			""")
    ccDF = ccDF.join(
        broadcast(emailexclDF), ccDF.email_id == emailexclDF.email_address,
        "left_outer").where(emailexclDF.email_address.isNull()).select(
            [F.col(c) for c in ccDF.columns])
    ccDF = ccDF.withColumn(
        "email_str",
        F.trim(F.lower(F.split(ccDF['email_id'], "@")[0]))).withColumn(
            "email_domain", F.trim(F.lower(F.split(ccDF['email_id'], "@")[1])))
    ccDFFilt = ccDF.filter(
        F.col("email_domain").isNotNull()
        | ~(str_isnumeric_udf(F.col("email_str"))))
    ccDFFilt = ccDFFilt.na.drop(subset=["email_str", "email_domain"])
    ccDFFilt = ccDFFilt.withColumn(
        "std_email_str", email_clean_udf(F.col("email_str"))).withColumn(
            "std_email_domain", email_clean_udf(F.col("email_domain"))).select(
                'dl_filename', 'dl_line_no', 'dl_file_prefix',
                'dl_file_timestamp', 'email_id', 'std_email_str',
                'std_email_domain')
    ccDFFilt.createOrReplaceTempView("contact_clean_data")
    ccDFGrp = spark.sql("""
				select std_email_str,std_email_domain 
				from contact_clean_data 
				group by std_email_str,std_email_domain
				""")
    ccDFGrp.createOrReplaceTempView("contact_uniq_date")
    emlmatchDF = spark.sql("""
				select a.* 
				from contact_clean_data a,contact_uniq_date b 
				where a.std_email_str = b.std_email_str and a.std_email_domain = b.std_email_domain
				""")
    #emlmatchDF.persist()
    #emlmatchDF.createOrReplaceTempView("contact_match_data")
    print("Email Match Step is done")
    return emlmatchDF
Example #19
def _clean_string_column(df: DataFrame, column_name: str) -> DataFrame:
    """
    Standardize the characters in a StringType() column. Lower case them, then replace spaces with '_'
    :param df: The dataframe to perform the operation on.
    :param column_name: The column to perform the operation on.
    :return: The same dataframe with the formatted column.
    """
    return df.withColumn(column_name,
                         F.lower(F.regexp_replace(column_name, ' ', '_')))
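# Hypothetical usage of _clean_string_column above; assumes `F` is
# pyspark.sql.functions and DataFrame is pyspark.sql.DataFrame.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
raw = spark.createDataFrame([("North America",), ("South East Asia",)], ["region"])
_clean_string_column(raw, "region").show()  # -> north_america, south_east_asia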
Example #20
def filter_tweets(txt_lst, df, col):
    '''
    returns entries containing a word in txt_lst
    '''
    res = spark.createDataFrame(sc.emptyRDD(), df.schema)
    for keyword in txt_lst:
            filt_tweets = df.filter(f.lower(col).contains(keyword.lower()))
            res = res.union(filt_tweets).distinct()
    return res
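# Hypothetical usage of filter_tweets above; the function relies on module-level
# `spark`, `sc` and `pyspark.sql.functions as f`, recreated here for a standalone run.
from pyspark.sql import SparkSession
import pyspark.sql.functions as f

spark = SparkSession.builder.getOrCreate()
sc = spark.sparkContext
tweets = spark.createDataFrame([("I love PySpark",), ("Hello world",)], ["text"])
filter_tweets(["PySpark"], tweets, f.col("text")).show()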
Example #21
def get_subreddit_topics_df(subreddit_topics_csv):
    # convert subreddit topics csv to df, this will be used to get topics column
    subreddit_topics = spark.read\
    .csv(subreddit_topics_csv, header='true', inferSchema='true')

    subreddit_topics = subreddit_topics\
    .withColumn('subreddit', lower(col('subreddit')))

    return subreddit_topics
Example #22
 def execute(self):
     """
     Method classifies the recipes with 'beef' as an ingredient for difficulty. Difficulty is
     classified based on the total time required for prepping and cooking the food. Method
     extends on the super class TransformRecipes classification logic by applying a filter for
     the keyword 'beef' in ingredients column.
     :return:
     """
     return super(BeefRecipes, self).execute().where(lower(col('ingredients')).like("%beef%"))
def create_sentiment(news_df):

    news_df['date'] = pd.to_datetime(news_df['date'])
    news_df = news_df[['date', 'symbol', 'combined_data', "sector_name"]]
    news_df = spark.createDataFrame(news_df)

    news_text_lower = news_df.select(
        news_df['date'], news_df['symbol'],
        functions.lower(news_df['combined_data']).alias("content"),
        news_df["sector_name"])

    #text preprocessing
    df = news_text_lower.withColumn(
        'text_non_asci', strip_non_ascii_udf(news_text_lower['content']))
    rm_df = df.withColumn('clean_text',
                          remove_features_udf(df['text_non_asci']))

    regexTokenizer = RegexTokenizer(gaps=False,
                                    pattern='\w+',
                                    inputCol='clean_text',
                                    outputCol='token')
    stopWordsRemover = StopWordsRemover(inputCol='token',
                                        outputCol='no_stopword')

    my_pipeline = Pipeline(stages=[regexTokenizer, stopWordsRemover])
    reg_model = my_pipeline.fit(rm_df)
    reg_df = reg_model.transform(rm_df)

    reg_joined = reg_df.withColumn("fine_text",
                                   join_udf(reg_df['no_stopword']))
    reg_joined1 = reg_joined.withColumn(
        'eng_text', strip_nonenglish_udf(reg_joined['fine_text']))
    clean_df = reg_joined1.select('date', 'symbol', 'eng_text', 'sector_name')
    #clean_df.show(2)

    #calculate sentiment score

    news_rdd = clean_df.rdd
    lemma_rdd = news_rdd.map(lemma)
    score_rdd = lemma_rdd.map(sentiment_analyzer_scores)
    news_sentiment = spark.createDataFrame(score_rdd).cache()

    news_sentiment = news_sentiment.withColumnRenamed('_1', 'date')
    news_sentiment = news_sentiment.withColumnRenamed('_2', 'symbol')
    news_sentiment = news_sentiment.withColumnRenamed('_3', 'sector_name')
    news_sentiment = news_sentiment.withColumnRenamed('_4', 'sentiment')
    news_sentiment = news_sentiment.withColumn(
        'senti_label', senti_udf(news_sentiment['sentiment']))
    stock_news = news_sentiment.toPandas()

    stock_news['date'] = stock_news['date'].apply(
        lambda x: pd.to_datetime(x).tz_convert('US/Eastern'))
    stock_news['date'] = stock_news['date'].dt.tz_localize(None)
    stock_news['date'] = stock_news['date'].dt.date
    stock_news['date'] = pd.to_datetime(stock_news['date'])
    return (stock_news)
Example #24
def task_5(data_io, product_processed_data, word_0, word_1, word_2):
    # -----------------------------Column names--------------------------------
    # Inputs:
    title_column = 'title'
    # Outputs:
    titleArray_column = 'titleArray'
    titleVector_column = 'titleVector'
    # -------------------------------------------------------------------------

    # ---------------------- Your implementation begins------------------------

    step = product_processed_data.select(
        title_column,
        F.lower(F.col(title_column)).alias('temp'))

    split = step.withColumn(titleArray_column,
                            F.split(F.col('temp'), ' ').cast("array<string>"))

    word2vec = M.feature.Word2Vec(minCount=100,
                                  vectorSize=16,
                                  seed=SEED,
                                  numPartitions=4,
                                  inputCol=titleArray_column,
                                  outputCol=titleVector_column)
    model = word2vec.fit(split)

    # -------------------------------------------------------------------------

    # ---------------------- Put results in res dict --------------------------
    res = {
        'count_total': None,
        'size_vocabulary': None,
        'word_0_synonyms': [
            (None, None),
        ],
        'word_1_synonyms': [
            (None, None),
        ],
        'word_2_synonyms': [
            (None, None),
        ]
    }
    # Modify res:

    res['count_total'] = split.count()
    res['size_vocabulary'] = model.getVectors().count()
    for name, word in zip(
        ['word_0_synonyms', 'word_1_synonyms', 'word_2_synonyms'],
        [word_0, word_1, word_2]):
        res[name] = model.findSynonymsArray(word, 10)

    # -------------------------------------------------------------------------

    # ----------------------------- Do not change -----------------------------
    data_io.save(res, 'task_5')
    return res
Example #25
	def process(self, cardo_context, cardo_dataframe):
		# type: (CardoContextBase, CardoDataFrame) -> CardoDataFrame
		new_columns = []
		for column in cardo_dataframe.dataframe.schema:
			relevant_formatters = filter(lambda formatter: formatter.is_type(column.dataType), self.__formatters)
			new_column = relevant_formatters[0].convert(cardo_dataframe.dataframe[column.name])
			new_column = F.lower(new_column).alias(column.name)
			new_columns.append(new_column)
		cardo_dataframe.dataframe = cardo_dataframe.dataframe.select(new_columns)
		return cardo_dataframe
Example #26
 def is_area_of_study(col):                   #function to check area of study
     @F.udf
     def f1(v):
         l = ['engineering', 'teaching', 'communications', 'animal ccience', 'science & math', 'law & government', 'architecture', 'business', 'culinary arts', 'performing arts', 'health profession', 'visual art & design', 'film/video', 'cosmetology', 'humanities & interdisciplinary', 'computer science & technology', 'project-based learning', 'hospitality, travel, & tourism', 'performing arts/visual art & design', 'environmental science', 'zoned']
         if v in l:
             return True
         else:
             return False
     x = col.cast('string')
     return F.count(F.when(f1(F.lower(x)) == True, x))
def getTrumpTwitt(df):
    parseDate = f.udf(lambda x: parser.parse(x), DateType())
    return df.select(
        parseDate(df.created_at).alias('created_at'),
        f.lower(df.text).alias('text_lower'), 'user.id_str',
        'user.screen_name', 'user.followers_count', df.retweet_count, df.id,
        f.col('entities.user_mentions.screen_name').alias('mentioned'),
        f.col('entities.hashtags.text').alias(
            'hashtags')).where((f.col('text_lower').like('%trump%'))
                               & (f.col('place.country_code') == 'US'))
def transform_countries(df: SparkDataFrame) -> SparkDataFrame:
    """Transform countries data
    Add a column of lower case country name for joining purpose

    :param df: the countries data frame to be transformed
    :return: the transformed countries frame
    """

    df = df.withColumn('lower_name', lower(df['name']))
    return df
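# Hypothetical usage of transform_countries above; assumes `lower` comes from
# pyspark.sql.functions and SparkDataFrame aliases pyspark.sql.DataFrame.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
countries = spark.createDataFrame([("Germany",), ("New Zealand",)], ["name"])
transform_countries(countries).show()  # adds lower_name for case-insensitive joins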
Example #29
def lower_array(df, array_col):
    """
    this function will take an array<string> column in the passed dataframe
    and lowercase each element in the array
    :param df: input dataframe
    :param array_col: name of the array<string> column
    :return: dataframe with array<string> column with lowercased elements
    """
    return df.withColumn(array_col,
                         split(lower(concat_ws(",", col(array_col))), ","))
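# Hypothetical usage of lower_array above; assumes split, lower, concat_ws and col
# are imported from pyspark.sql.functions. The concat_ws/split round-trip assumes
# the array elements themselves contain no commas.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
tags = spark.createDataFrame([(["Spark", "SQL"],), (["Python"],)], ["tags"])
lower_array(tags, "tags").show(truncate=False)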
Example #30
def bucket_engine(df):
    eng = F.lower(F.col("engine"))
    return df.withColumn(
        "engine",
        F.when(eng.like("google%"), "google")
        .when(eng.like("ddg%"), "duckduckgo")
        .when(eng.like("duckduckgo%"), "duckduckgo")
        .when(eng.like("bing%"), "bing")
        .otherwise("other"),
    )
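# Hypothetical usage of bucket_engine above; assumes `F` is pyspark.sql.functions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
searches = spark.createDataFrame(
    [("Google Search",), ("DDG Lite",), ("Bing Mobile",), ("Yandex",)], ["engine"])
bucket_engine(searches).show()  # -> google, duckduckgo, bing, other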
Example #31
 def preprocess(self, df: DataFrame):
     preprocess_df = df.filter(~F.isnull("se_property")) \
         .withColumn("se_label", F.lower(F.col("se_label"))) \
         .filter(~F.isnull("se_label")) \
         .withColumn("se_property_type", self.classify("event_sub_type", "se_label")) \
         .withColumn("isSuspect", F.col("isSuspect").cast("int")) \
         .filter(F.col("author_id").isNotNull() | F.col("discovery_id").isNotNull()) \
         .drop_duplicates(["event_id", "user_token", "device_id", "user_ipaddress", "isSuspect"]) \
         .withColumn("hour", F.hour("collector_tstamp"))
     return preprocess_df
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.
    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.
    Args:
        column (Column): A Column containing a sentence.
    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    return (trim(regexp_replace(lower(column),'[^a-zA-Z0-9 ]','')).alias('sentence'))
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    #column_val = regexp_replace(column, "\p{Punct}", "")
    #return trim(lower(column_val))
    word = lower(trim(regexp_replace(regexp_replace(column, '[^\w\s]', ''),'_',''))).alias("word")
    return word
def removePunctuation(column):
    """Removes punctuation, changes to lower case, and strips leading and trailing spaces.

    Note:
        Only spaces, letters, and numbers should be retained.  Other characters should be
        eliminated (e.g. it's becomes its).  Leading and trailing spaces should be removed after
        punctuation is removed.

    Args:
        column (Column): A Column containing a sentence.

    Returns:
        Column: A Column named 'sentence' with clean-up operations applied.
    """
    
#     assert(isinstance(column, pyspark.sql.column.Column))
    assert(str(type(column)) == "<class 'pyspark.sql.column.Column'>")    
    
    columnNoPunct = regexp_replace(column, "[^a-zA-Z0-9 ]", "")
#     columnNoPunct = regexp_replace(column, string.punctuation, "")    
    columnLowerCase = lower(columnNoPunct)
    columnTrimmed = trim(columnLowerCase)
    
    return columnTrimmed
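# Hypothetical usage of the removePunctuation variants above; assumes lower,
# regexp_replace and trim are imported from pyspark.sql.functions.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()
sentences = spark.createDataFrame([("  Hi, you!",), ("It's ME...  ",)], ["sentence"])
sentences.select(removePunctuation(col("sentence"))).show()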
from pyspark.sql.functions import monotonically_increasing_id
df.select(monotonically_increasing_id()).show(2)


# COMMAND ----------

from pyspark.sql.functions import initcap
df.select(initcap(col("Description"))).show()


# COMMAND ----------

from pyspark.sql.functions import lower, upper
df.select(col("Description"),
    lower(col("Description")),
    upper(lower(col("Description")))).show(2)


# COMMAND ----------

from pyspark.sql.functions import lit, ltrim, rtrim, rpad, lpad, trim
df.select(
    ltrim(lit("    HELLO    ")).alias("ltrim"),
    rtrim(lit("    HELLO    ")).alias("rtrim"),
    trim(lit("    HELLO    ")).alias("trim"),
    lpad(lit("HELLO"), 3, " ").alias("lp"),
    rpad(lit("HELLO"), 10, " ").alias("rp")).show(2)


# COMMAND ----------
Example #36
# COMMAND ----------

import pyspark.sql.functions as func
from pyspark.sql.functions import col

filtered = dfSmall.filter((col("title") != "<PARSE ERROR>") & col("redirect_title").isNull() & col("text").isNotNull())

# COMMAND ----------

# MAGIC %md
# MAGIC Change all text to lower case.

# COMMAND ----------

lowered = filtered.select("*", func.lower(col("text")).alias("lowerText"))

# COMMAND ----------

parsed = lowered.drop("text").withColumnRenamed("lowerText", "text")

# COMMAND ----------

# MAGIC %md
# MAGIC Split the Wikipedia text into sentences.

# COMMAND ----------

pattern = r"(\. |\n{2,})"
import re
Example #37
# COMMAND ----------

import pyspark.sql.functions as func
from pyspark.sql.functions import col
filtered = dfSmall.filter((col('title') != '<PARSE ERROR>') &
                           col('redirect_title').isNull() &
                           col('text').isNotNull())

# COMMAND ----------

# MAGIC %md
# MAGIC Change all text to lower case.

# COMMAND ----------

lowered = filtered.select('*', func.lower(col('text')).alias('lowerText'))

# COMMAND ----------

parsed = (lowered
          .drop('text')
          .withColumnRenamed('lowerText', 'text'))

# COMMAND ----------

# MAGIC %md
# MAGIC Split the Wikipedia text into sentences.

# COMMAND ----------

pattern = r'(\. |\n{2,})'
Example #38
from lib.pos_tags import PosTags
from lib.chunks import Chunks

t = Tokens()
p = PosTags()
c = Chunks()

c.train(c.load_training_data("../data/chunker_training_50_fixed.json"))


def pipeline(s):
    """
    Given a string, return a list of relations
    """
    return c.assemble(c.tag(p.tag(t.tokenize(s))))


pipeline_udf = sql.udf(pipeline, types.ArrayType(types.MapType(types.StringType(), types.StringType())))


phrases = (
    notes.withColumn("phrases", pipeline_udf(notes["document"]))
    .select(sql.explode(sql.col("phrases")).alias("text"))
    .filter(sql.col("text")["tag"] == "NP")
    .select(sql.lower(sql.col("text")["phrase"]).alias("phrase"))
    .groupBy(sql.col("phrase"))
    .count()
)

phrases.write.parquet("../data/idigbio_phrases.parquet")
Example #39
withCEST = withDate.withColumn('cest_time', func.from_utc_timestamp('timestamp', 'Europe/Amsterdam'))
withCEST.printSchema()

(withCEST
 .select('timestamp', 'cest_time')
 .show(3, False))

# COMMAND ----------

# MAGIC %md
# MAGIC Next, let's convert the text field to lowercase.  We'll use the `lower` function for this.

# COMMAND ----------

lowered = withCEST.select('*', func.lower(col('text')).alias('lowerText'))

print(lowered.select('lowerText').first())

# COMMAND ----------

# MAGIC %md
# MAGIC What columns do we have now?

# COMMAND ----------

print(lowered.columns)

# COMMAND ----------

# MAGIC %md