Example #1
import inspect
from itertools import chain

from pyspark.ml.feature import StopWordsRemover


def task_four(ngram):
    """
    Prompt the user for the ngram value and, for bigrams, remove stop words.
    :param ngram: n-gram size (the stop-word removal below runs when it is 2)
    :return: None
    """
    # Inspect this function's signature and prompt for a value for each parameter
    params = list(inspect.getfullargspec(task_four))
    p = list(chain.from_iterable([i for i in params if i is not None]))
    param_values = {}
    for v in p:
        try:
            value = input("Please enter a value for {} ==> ".format(v))
            param_values.update({v: value})
        except (EOFError, KeyboardInterrupt):
            pass
    ngram = param_values.get(p[0])

    if int(ngram) == 2:
        # --- list of stopwords
        stopwords = {
            'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
            'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
            'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
            'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
            'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
            'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
            'having', 'do', 'does', 'did', 'doing', 'an', 'the', 'and', 'but',
            'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
            'for', 'with', 'about', 'against', 'between', 'into', 'through',
            'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up',
            'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
            'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
            'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
            'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
            'so', 'than', 'too', 'very', 'can', 'will', 'just', 'don',
            'should', 'now', 'a', 'insured', 'sured', 'coverage', 'year',
            'dob', 'insd', 'left'
        }

        # --- remove stop words: extend Spark's default list with the
        # --- domain-specific terms defined above
        REMOVER = StopWordsRemover()
        REMOVER.setStopWords(REMOVER.getStopWords() + sorted(stopwords))
        REMOVER.setInputCol("inter_wordlist")
        REMOVER.setOutputCol("inter_wordlist_two")

        stpwrds_rmvd_sdf = REMOVER.transform(VECTOR_DATAFRAME) \
                                    .select(["Claim_Id", "filename", "inter_wordlist_two"])

    else:
        pass
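For illustration only, here is a minimal sketch of how a remover configured like the one above could be exercised on a toy DataFrame shaped like the assumed VECTOR_DATAFRAME (all names in this sketch are hypothetical):

from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover

spark = SparkSession.builder.appName("stopwords-demo").getOrCreate()

# Toy stand-in for VECTOR_DATAFRAME: one row with an already tokenized word list
demo_df = spark.createDataFrame(
    [(1, "claim_a.txt", ["the", "insured", "left", "the", "vehicle"])],
    ["Claim_Id", "filename", "inter_wordlist"],
)

remover = StopWordsRemover(inputCol="inter_wordlist", outputCol="inter_wordlist_two")
remover.transform(demo_df).select("Claim_Id", "filename", "inter_wordlist_two").show(truncate=False)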
Example #2
from nltk.stem.porter import PorterStemmer
from pyspark.ml.feature import StopWordsRemover, Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType


def tokenize_df(df):
    # Clean, tokenize, remove stop words, and stem the "text" column
    tokenizer = Tokenizer(inputCol="text", outputCol="vector")
    remover = StopWordsRemover()
    remover.setInputCol("vector")
    remover.setOutputCol("vector_no_stopw")
    stopwords = remover.getStopWords()  # default English stop words (for reference)
    stemmer = PorterStemmer()
    # Stem each token in the stop-word-filtered token list
    stemmer_udf = udf(lambda tokens: [stemmer.stem(t) for t in tokens],
                      ArrayType(StringType()))

    df = df.select(clean_text(col("text")).alias("text"))  # clean_text: project-level helper defined elsewhere
    df = tokenizer.transform(df).select("vector")
    df = remover.transform(df).select("vector_no_stopw")
    df = (df
        .withColumn("vector_stemmed", stemmer_udf("vector_no_stopw"))
        .select("vector_stemmed")
        )
    
    return df
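A minimal, hypothetical way to exercise tokenize_df, assuming a trivial clean_text stand-in (the source project defines its own clean_text elsewhere):

from pyspark.sql import SparkSession
from pyspark.sql.functions import lower, regexp_replace

spark = SparkSession.builder.appName("tokenize-demo").getOrCreate()

def clean_text(column):
    # Illustrative stand-in: lower-case the text and strip non-letter characters
    return regexp_replace(lower(column), r"[^a-z\s]", "")

demo_df = spark.createDataFrame(
    [("The insured's car was damaged, allegedly!",)], ["text"]
)
tokenize_df(demo_df).show(truncate=False)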
def init_base_df(file_path=default_file_path):
    # Set legacy parsing as Spark 3.0+ cannot use 'E' for timestamp
    spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

    print("Loading", default_file_path)

    raw_df = (
        spark.read.format("csv")
        .option("inferSchema", True)
        .load(file_path)
        .toDF("polarity", "tweet_id", "datetime", "query", "user", "text")
    )

    # Parse string to timestamp
    time_parsed_df = raw_df.withColumn(
        "timestamp", to_timestamp("datetime", "EEE MMM dd HH:mm:ss zzz yyyy")
    )

    df = time_parsed_df.drop("query").drop("datetime")

    # Shift polarity from a range of [0:4], to [-1:1]
    scaled_polarity_df = df.withColumn("sentiment", (col("polarity") / 2) - 1).drop(
        "polarity"
    )

    clean_text_df = df.select(clean_text(col("text")).alias("text"), "tweet_id")

    tokenizer = Tokenizer(inputCol="text", outputCol="vector")
    vector_df = tokenizer.transform(clean_text_df).select("vector", "tweet_id")

    remover = StopWordsRemover()
    stopwords = remover.getStopWords()

    remover.setInputCol("vector")
    remover.setOutputCol("tokens")

    tokens_no_stopw_df = remover.transform(vector_df).select("tokens", "tweet_id")

    tweets_with_tokens_df = scaled_polarity_df.join(tokens_no_stopw_df, on=["tweet_id"])

    return tweets_with_tokens_df
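init_base_df assumes the usual pyspark.sql.functions and pyspark.ml.feature imports plus module-level spark, clean_text, and default_file_path objects; one hypothetical setup (the CSV path and the clean_text body are assumptions):

from pyspark.sql import SparkSession
from pyspark.sql.functions import lower, regexp_replace

spark = SparkSession.builder.appName("sentiment140-prep").getOrCreate()

# Hypothetical stand-in for the project's clean_text helper
def clean_text(column):
    # drop URLs and non-letter characters after lower-casing
    return regexp_replace(lower(column), r"http\S+|[^a-z\s]", " ")

# Assumed location of a Sentiment140-style CSV (no header row)
default_file_path = "data/training.1600000.processed.noemoticon.csv"

tweets_with_tokens_df = init_base_df()
tweets_with_tokens_df.show(5)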
def spark_transformation_comments(filename_read_S3, filename_write_elastic,
                                  filename_write_S3):
    """
    Columns in Input:
    'archived', 'author', 'author_flair_css_class', 'author_flair_text', 'body', 'controversiality', 'created_utc',
     'distinguished', 'downs', 'edited', 'gilded', 'id', 'link_id', 'name', 'parent_id', 'retrieved_on', 'score',
     'score_hidden', 'subreddit', 'subreddit_id', 'ups'

    :param filename_read_S3: File to read from
    :param filename_write_elastic: Output file for Elastic
    :param filename_write_S3: Cleaned files to S3
    :return:
    """

    # ---------------------------------------------
    # -------- BASIC TRANSFORMATIONS --------
    # ----------------------------------------------
    logger.info("Stage 1: read file into Dataframe from S3")
    comments_df1 = sqlContext.read.parquet(filename_read_S3)

    columns = comments_df1.columns
    logger.info("List of columns for Comments - {0}".format(columns))

    logger.info("Stage 2: select required columns from data")
    # NOTE: Column "Downs", 'name' isn't available for all years
    # Its available only from 2006-06
    if 'downs' in columns and 'name' in columns:
        comments_df2 = comments_df1.select('subreddit', 'subreddit_id',
                                           'created_utc', 'author', 'id',
                                           'link_id', 'parent_id', 'body',
                                           'controversiality', 'distinguished',
                                           'gilded', 'score', 'ups', 'downs',
                                           'name')
    else:
        comments_df2 = comments_df1.select('subreddit', 'subreddit_id',
                                           'created_utc', 'author', 'id',
                                           'link_id', 'parent_id', 'body',
                                           'controversiality', 'distinguished',
                                           'gilded', 'score', 'ups')

    logger.info(
        "Stage 3: Removing rows where the post has been deleted"
    )  # TODO: also handle removed posts
    comments_df3 = comments_df2.filter(comments_df2['author'] != '[deleted]')

    # Create and Register trim_link as UDF
    spark.udf.register("trimlinks", trim_link, StringType())
    trim_link_udf = udf(trim_link)

    logger.info("Stage 4: get submission_id from link_id")
    comments_df4 = comments_df3.withColumn("submission_id",
                                           trim_link_udf(col("link_id")))

    # ---------------------------------------------
    # -------- FEATURE ENGINEERING ---------------
    # ---------------------------------------------
    logger.info("Stage 5: Convert 'created_uct' to unix timestamp")
    comments_df5 = comments_df4.select(
        'subreddit', 'subreddit_id', 'author', 'id', 'parent_id', 'body',
        'controversiality', 'distinguished', 'gilded', 'score', 'ups', 'downs',
        'name', 'submission_id',
        from_unixtime('created_utc').alias('timestamp'))

    logger.info(
        "Stage 6: Add new features: Year, Month, day, hour, minute, week, julian day"
    )
    comments_df6 = comments_df5.select(
        'subreddit', 'subreddit_id', 'author', 'id', 'parent_id', 'body',
        'controversiality', 'distinguished', 'gilded', 'score', 'ups', 'downs',
        'name', 'submission_id',
        year(comments_df5.timestamp).alias('year'),
        month(comments_df5.timestamp).alias('month'),
        dayofmonth(comments_df5.timestamp).alias('day'),
        dayofyear(comments_df5.timestamp).alias('day_of_year'),
        hour(comments_df5.timestamp).alias('hour'),
        minute(comments_df5.timestamp).alias('min'),
        weekofyear(comments_df5.timestamp).alias('week_of_year'))

    # ---------------------------------------
    #   PERSIST Data for following reasons:
    # ---------------------------------------
    # 1. Write data to ElasticSearch after ETL
    # 2. Perform NLP based data cleaning for comments
    # 3. Identify popular words
    # 4. Load NLP cleaned data to S3
    # 5. Load Words to ElasticSearch
    comments_df6.persist(StorageLevel.MEMORY_AND_DISK_SER)
    logger.info("persisted data after initial cleaning")

    # -------------------------------------------
    # Write to ElasticSearch: NDJSON file
    # -------------------------------------------
    # Load the Cleaned data to ElasticSearch
    logger.info("starting transforming data to NDJSON - for Large ES load")
    nd_json = comments_df6.rdd.map(lambda x: elastic_search_mapper_body(x))
    logger.info("completed transformation to NDJSON")

    logger.info("save data as Text file")
    if not os.path.exists(filename_write_elastic):
        nd_json.saveAsTextFile(filename_write_elastic)
        ES_WRITE_STATUS = True
    else:
        logger.info("data already loaded")

    # ----------------------------
    # NLP transformations Pipeline
    # -----------------------------
    logger.info("Stage 7: Remove Punctuations")
    comments_df7 = comments_df6.select(
        'subreddit', 'subreddit_id', 'author', 'id', 'parent_id', 'body',
        'controversiality', 'distinguished', 'gilded', 'score', 'ups', 'downs',
        'name', 'submission_id', 'year', 'month', 'day', 'day_of_year', 'hour',
        'min', 'week_of_year', removePunctuation(col('body')))

    logger.info("stage 8: Word Tokenization")
    tokenizer = Tokenizer(inputCol="cleaned_body", outputCol="tokenized_body")
    comments_df8 = tokenizer.transform(comments_df7).select(
        'subreddit', 'subreddit_id', 'author', 'id', 'parent_id', 'body',
        'controversiality', 'distinguished', 'gilded', 'score', 'ups', 'downs',
        'name', 'submission_id', 'year', 'month', 'day', 'day_of_year', 'hour',
        'min', 'week_of_year', 'tokenized_body')

    # StopWords Removal
    logger.info("Stage 9: Using SPARK default stopwords.")
    remover = StopWordsRemover()
    remover.setInputCol("tokenized_body")
    remover.setOutputCol("no_stop_words_body")
    comments_df9 = remover.transform(comments_df8).select(
        'subreddit', 'subreddit_id', 'author', 'id', 'parent_id', 'body',
        'controversiality', 'distinguished', 'gilded', 'score', 'ups', 'downs',
        'name', 'submission_id', 'year', 'month', 'day', 'day_of_year', 'hour',
        'min', 'week_of_year', 'no_stop_words_body')
    logger.info("Stage 10: Making a Custom list of words")
    # TODO: Get Reddit frequent words
    spark.udf.register("filterExtraStopWords", filter_stop_words,
                       ArrayType(StringType()))
    filter_stop_words_udf = udf(filter_stop_words, ArrayType(StringType()))
    comments_df10 = comments_df9.select(
        'subreddit', 'subreddit_id', 'author', 'id', 'parent_id', 'body',
        'controversiality', 'distinguished', 'gilded', 'score', 'ups', 'downs',
        'name', 'submission_id', 'year', 'month', 'day', 'day_of_year', 'hour',
        'min', 'week_of_year',
        filter_stop_words_udf("no_stop_words_body").alias(
            "body_without_stopwords"))

    logger.info(comments_df10.schema.simpleString())
    # -------------------------
    # Upload Cleaned data to S3
    # -------------------------
    comments_df10.write.parquet(filename_write_S3)
    logger.info("completed loading the data to S3")

    return
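spark_transformation_comments relies on several project-level helpers (trim_link, filter_stop_words, removePunctuation, elastic_search_mapper_body) that are defined elsewhere in the source repository. Purely for orientation, a rough, hypothetical sketch of what the first three might look like:

from pyspark.sql.functions import lower, regexp_replace, trim

# Hypothetical stand-ins; the real project defines its own versions.

def trim_link(link_id):
    # Reddit link ids look like "t3_abc123"; drop the type prefix to recover
    # the submission id (assumed behaviour)
    return link_id.split("_", 1)[-1] if link_id else link_id

EXTRA_STOP_WORDS = {"r", "u", "gt", "lt", "amp"}  # illustrative Reddit-specific tokens

def filter_stop_words(tokens):
    # Drop additional high-frequency tokens left over after Spark's default list
    return [t for t in tokens if t not in EXTRA_STOP_WORDS]

def removePunctuation(column):
    # Lower-case, strip punctuation, trim whitespace, and alias the result as
    # 'cleaned_body' so the Tokenizer stage can pick it up
    return trim(lower(regexp_replace(column, r"[^\w\s]", ""))).alias("cleaned_body")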
Example #5
vector_df.show(10)

"""**3. Remove** **stop words**"""

from pyspark.ml.feature import StopWordsRemover

# Define a list of stop words or use default list
remover = StopWordsRemover()
stopwords = remover.getStopWords()

# Display default list
stopwords[:10]

# Specify input/output columns
remover.setInputCol("vector")
remover.setOutputCol("Body_no_stopw")

# Transform existing dataframe with the StopWordsRemover
Body_no_stopw_df = remover.transform(vector_df).select("Body_no_stopw")

# Display
Body_no_stopw_df.printSchema()
Body_no_stopw_df.show()

"""**4. Tokenizing posts into words**"""

# Import stemmer library
from nltk.stem.porter import PorterStemmer

# Instantiate stemmer object
stemmer = PorterStemmer()
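The snippet stops after instantiating the stemmer; a plausible continuation (a sketch, not necessarily the original notebook's code) wraps it in a UDF and applies it to the stop-word-filtered column:

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

# Wrap the stemmer in a UDF so it can run over each token list
stemmer_udf = udf(lambda tokens: [stemmer.stem(t) for t in tokens],
                  ArrayType(StringType()))

Body_stemmed_df = Body_no_stopw_df.withColumn(
    "Body_stemmed", stemmer_udf("Body_no_stopw")
).select("Body_stemmed")
Body_stemmed_df.show()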
    def import_data(self):

        # meta df
        meta_df = pd.read_csv(self.metadata_path, dtype={
            'pubmed_id': str,
            'Microsoft Academic Paper ID': str,
            'doi': str
        })

        # json
        all_json = glob.glob(f"{self. DEFAULT_INPUT_PATH}/**/*.json", recursive=True)

        dict_ = {'paper_id': [], 'doi': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [],
                 'abstract_summary': []}
        for idx, entry in enumerate(all_json):
            # progress message roughly every 10% (guard against fewer than 10 files)
            if len(all_json) >= 10 and idx % (len(all_json) // 10) == 0:
                print(f'Processing index: {idx} of {len(all_json)}')

            try:
                content = FileReader(entry)
            except Exception as e:
                continue  # invalid paper format, skip

            # get metadata information
            meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
            # no metadata, skip this paper
            if len(meta_data) == 0:
                continue

            dict_['abstract'].append(content.abstract)
            dict_['paper_id'].append(content.paper_id)
            dict_['body_text'].append(content.body_text)

            # also create a column for the summary of abstract to be used in a plot
            if len(content.abstract) == 0:
                # no abstract provided
                dict_['abstract_summary'].append("Not provided.")
            elif len(content.abstract.split(' ')) > 100:
                # abstract provided is too long for plot, take first 100 words append with ...
                info = content.abstract.split(' ')[:100]
                summary = self.get_breaks(' '.join(info), 40)
                dict_['abstract_summary'].append(summary + "...")
            else:
                # abstract is short enough
                summary = self.get_breaks(content.abstract, 40)
                dict_['abstract_summary'].append(summary)

            try:
                # if more than one author
                authors = meta_data['authors'].values[0].split(';')
                if len(authors) > 2:
                    # if more than 2 authors, take them all with html tag breaks in between
                    dict_['authors'].append(self.get_breaks('. '.join(authors), 40))
                else:
                    # authors will fit in plot
                    dict_['authors'].append(". ".join(authors))
            except Exception as e:
                # if only one author - or null value
                dict_['authors'].append(meta_data['authors'].values[0])

            # add the title information, add breaks when needed
            try:
                title = self.get_breaks(meta_data['title'].values[0], 40)
                dict_['title'].append(title)
            # if title was not provided
            except Exception as e:
                dict_['title'].append(meta_data['title'].values[0])

            # add the journal information
            dict_['journal'].append(meta_data['journal'].values[0])

            # add doi
            dict_['doi'].append(meta_data['doi'].values[0])

        df_covid = pd.DataFrame(dict_,
                                columns=['paper_id', 'doi', 'abstract', 'body_text', 'authors', 'title', 'journal',
                                         'abstract_summary'])
        df_covid['abstract_word_count'] = df_covid['abstract'].apply(
            lambda x: len(x.strip().split()))  # word count in abstract
        df_covid['body_word_count'] = df_covid['body_text'].apply(
            lambda x: len(x.strip().split()))  # word count in body
        df_covid['body_unique_words'] = df_covid['body_text'].apply(
            lambda x: len(set(str(x).split())))  # number of unique words in body

        # remove duplicates
        df_covid.drop_duplicates(['abstract', 'body_text'], inplace=True)
        df_covid['abstract'].describe(include='all')
        df_covid.dropna(inplace=True)

        # handle multiple languages
        # set seed
        DetectorFactory.seed = 0

        # hold label - language
        languages = []

        # go through each text
        for ii in tqdm(range(0, len(df_covid))):
            # split by space into a list, take the first N items, join with a space
            text = df_covid.iloc[ii]['body_text'].split(" ")

            lang = "en"
            try:
                if len(text) > 50:
                    lang = detect(" ".join(text[:50]))
                elif len(text) > 0:
                    lang = detect(" ".join(text[:len(text)]))
            # detection failed: the beginning of the document was not in a usable format
            except Exception as e:
                all_words = set(text)
                try:
                    lang = detect(" ".join(all_words))
                # still no luck; see if the abstract gives us any usable text
                except Exception as e:
                    try:
                        # try to label the language through the abstract summary instead
                        lang = detect(df_covid.iloc[ii]['abstract_summary'])
                    except Exception as e:
                        lang = "unknown"

            # get the language
            languages.append(lang)

        languages_dict = {}
        for lang in set(languages):
            languages_dict[lang] = languages.count(lang)

        df_covid['language'] = languages
        # keep only English-language documents
        df_covid = df_covid[df_covid['language'] == 'en']


        # change to spark
        # Enable Arrow-based columnar data transfers
        spark = SparkSession \
            .builder \
            .appName("PySparkKMeans") \
            .config("spark.some.config.option", "some-value") \
            .getOrCreate()
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")

        # Create a Spark DataFrame from a pandas DataFrame using Arrow
        df_english = spark.createDataFrame(df_covid)
        clean_text_df = df_english.withColumn("text", self.clean_text(col("body_text")))

        tokenizer = Tokenizer(inputCol="text", outputCol="vector")
        vector_df = tokenizer.transform(clean_text_df)


        # remove stopwords
        punctuations = string.punctuation
        stopwords = list(STOP_WORDS)
        stopwords[:10]

        custom_stop_words = [
            'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
            'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
            'al.', 'elsevier', 'pmc', 'czi', 'www', "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"
        ]

        for w in custom_stop_words:
            if w not in stopwords:
                stopwords.append(w)

        # Define a list of stop words or use default list
        remover = StopWordsRemover(stopWords=stopwords)

        # Specify input/output columns
        remover.setInputCol("vector")
        remover.setOutputCol("vector_no_stopw")

        # Transform existing dataframe with the StopWordsRemover
        vector_no_stopw_df = remover.transform(vector_df)



        # tf-idf (mllib RDD API): HashingTF/IDF operate on an RDD of token lists
        hashingTF = HashingTF()
        tf = hashingTF.transform(
            vector_no_stopw_df.select("vector_no_stopw").rdd.map(lambda row: row[0]))

        # While applying HashingTF only needs a single pass over the data, applying IDF
        # needs two passes: first to compute the IDF vector and second to scale the
        # term frequencies by IDF.
        tf.cache()
        idf = IDF().fit(tf)
        tfidf = idf.transform(tf)

        # PCA
        mat = RowMatrix(tfidf)
        # Compute the top 1325 principal components.
        # Principal components are stored in a local dense matrix.
        # NOTE: computePrincipalComponents assumes the hashed feature dimension is small
        # enough (at most 65535 columns) to be handled locally.
        pc = mat.computePrincipalComponents(1325)

        # Project the rows onto the linear space spanned by the top principal components.
        projected = mat.multiply(pc)
        # RowMatrix has no toPandas(); collect the projected rows into pandas before writing
        projected_pdf = pd.DataFrame(projected.rows.map(lambda v: v.toArray()).collect())
        projected_pdf.to_csv(f"{self.DEFAULT_OUTPUT_FILE}")

        return projected