Example #1
import inspect
from itertools import chain

from pyspark.ml.feature import StopWordsRemover


def task_four(ngram):
    """
    Prompt the user for the ngram value and, for bigrams, remove stop words.
    :param ngram: n-gram size (the stop-word removal below runs when it is 2)
    :return: None
    """
    # Inspect this function's signature and prompt for a value for each parameter
    params = list(inspect.getfullargspec(task_four))
    p = list(chain.from_iterable([i for i in params if i is not None]))
    param_values = {}
    for v in p:
        try:
            value = input("Please enter a value for {} ==> ".format(v))
            param_values.update({v: value})
        except (EOFError, KeyboardInterrupt):
            pass
    ngram = param_values.get(p[0])

    if int(ngram) == 2:
        # --- list of stopwords
        stopwords = {
            'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
            'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
            'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
            'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
            'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
            'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had',
            'having', 'do', 'does', 'did', 'doing', 'an', 'the', 'and', 'but',
            'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by',
            'for', 'with', 'about', 'against', 'between', 'into', 'through',
            'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up',
            'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again',
            'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why',
            'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most',
            'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
            'so', 'than', 'too', 'very', 'can', 'will', 'just', 'don',
            'should', 'now', 'a', 'insured', 'sured', 'coverage', 'year',
            'dob', 'insd', 'left'
        }

        # --- remove stop words: extend Spark's default list with the
        # --- domain-specific terms defined above
        REMOVER = StopWordsRemover()
        REMOVER.setStopWords(REMOVER.getStopWords() + sorted(stopwords))
        REMOVER.setInputCol("inter_wordlist")
        REMOVER.setOutputCol("inter_wordlist_two")

        stpwrds_rmvd_sdf = REMOVER.transform(VECTOR_DATAFRAME) \
                                    .select(["Claim_Id", "filename", "inter_wordlist_two"])

    else:
        pass
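For illustration only, here is a minimal sketch of how a remover configured like the one above could be exercised on a toy DataFrame shaped like the assumed VECTOR_DATAFRAME (all names in this sketch are hypothetical):

from pyspark.sql import SparkSession
from pyspark.ml.feature import StopWordsRemover

spark = SparkSession.builder.appName("stopwords-demo").getOrCreate()

# Toy stand-in for VECTOR_DATAFRAME: one row with an already tokenized word list
demo_df = spark.createDataFrame(
    [(1, "claim_a.txt", ["the", "insured", "left", "the", "vehicle"])],
    ["Claim_Id", "filename", "inter_wordlist"],
)

remover = StopWordsRemover(inputCol="inter_wordlist", outputCol="inter_wordlist_two")
remover.transform(demo_df).select("Claim_Id", "filename", "inter_wordlist_two").show(truncate=False)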
Example #2
from nltk.stem.porter import PorterStemmer
from pyspark.ml.feature import StopWordsRemover, Tokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType


def tokenize_df(df):
    # Clean, tokenize, remove stop words, and stem the "text" column
    tokenizer = Tokenizer(inputCol="text", outputCol="vector")
    remover = StopWordsRemover()
    remover.setInputCol("vector")
    remover.setOutputCol("vector_no_stopw")
    stopwords = remover.getStopWords()  # default English stop words (for reference)
    stemmer = PorterStemmer()
    # Stem each token in the stop-word-filtered token list
    stemmer_udf = udf(lambda tokens: [stemmer.stem(t) for t in tokens],
                      ArrayType(StringType()))

    df = df.select(clean_text(col("text")).alias("text"))  # clean_text: project-level helper defined elsewhere
    df = tokenizer.transform(df).select("vector")
    df = remover.transform(df).select("vector_no_stopw")
    df = (df
        .withColumn("vector_stemmed", stemmer_udf("vector_no_stopw"))
        .select("vector_stemmed")
        )
    
    return df
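A minimal, hypothetical way to exercise tokenize_df, assuming a trivial clean_text stand-in (the source project defines its own clean_text elsewhere):

from pyspark.sql import SparkSession
from pyspark.sql.functions import lower, regexp_replace

spark = SparkSession.builder.appName("tokenize-demo").getOrCreate()

def clean_text(column):
    # Illustrative stand-in: lower-case the text and strip non-letter characters
    return regexp_replace(lower(column), r"[^a-z\s]", "")

demo_df = spark.createDataFrame(
    [("The insured's car was damaged, allegedly!",)], ["text"]
)
tokenize_df(demo_df).show(truncate=False)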
def init_base_df(file_path=default_file_path):
    # Set legacy parsing as Spark 3.0+ cannot use 'E' for timestamp
    spark.sql("set spark.sql.legacy.timeParserPolicy=LEGACY")

    print("Loading", default_file_path)

    raw_df = (
        spark.read.format("csv")
        .option("inferSchema", True)
        .load(file_path)
        .toDF("polarity", "tweet_id", "datetime", "query", "user", "text")
    )

    # Parse string to timestamp
    time_parsed_df = raw_df.withColumn(
        "timestamp", to_timestamp("datetime", "EEE MMM dd HH:mm:ss zzz yyyy")
    )

    df = time_parsed_df.drop("query").drop("datetime")

    # Shift polarity from a range of [0:4], to [-1:1]
    scaled_polarity_df = df.withColumn("sentiment", (col("polarity") / 2) - 1).drop(
        "polarity"
    )

    clean_text_df = df.select(clean_text(col("text")).alias("text"), "tweet_id")

    tokenizer = Tokenizer(inputCol="text", outputCol="vector")
    vector_df = tokenizer.transform(clean_text_df).select("vector", "tweet_id")

    remover = StopWordsRemover()
    stopwords = remover.getStopWords()

    remover.setInputCol("vector")
    remover.setOutputCol("tokens")

    tokens_no_stopw_df = remover.transform(vector_df).select("tokens", "tweet_id")

    tweets_with_tokens_df = scaled_polarity_df.join(tokens_no_stopw_df, on=["tweet_id"])

    return tweets_with_tokens_df
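init_base_df assumes the usual pyspark.sql.functions and pyspark.ml.feature imports plus module-level spark, clean_text, and default_file_path objects; one hypothetical setup (the CSV path and the clean_text body are assumptions):

from pyspark.sql import SparkSession
from pyspark.sql.functions import lower, regexp_replace

spark = SparkSession.builder.appName("sentiment140-prep").getOrCreate()

# Hypothetical stand-in for the project's clean_text helper
def clean_text(column):
    # drop URLs and non-letter characters after lower-casing
    return regexp_replace(lower(column), r"http\S+|[^a-z\s]", " ")

# Assumed location of a Sentiment140-style CSV (no header row)
default_file_path = "data/training.1600000.processed.noemoticon.csv"

tweets_with_tokens_df = init_base_df()
tweets_with_tokens_df.show(5)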
def spark_transformation_comments(filename_read_S3, filename_write_elastic,
                                  filename_write_S3):
    """
    Columns in Input:
    'archived', 'author', 'author_flair_css_class', 'author_flair_text', 'body', 'controversiality', 'created_utc',
     'distinguished', 'downs', 'edited', 'gilded', 'id', 'link_id', 'name', 'parent_id', 'retrieved_on', 'score',
     'score_hidden', 'subreddit', 'subreddit_id', 'ups'

    :param filename_read_S3: File to read from
    :param filename_write_elastic: Output file for Elastic
    :param filename_write_S3: Cleaned files to S3
    :return:
    """

    # ---------------------------------------------
    # -------- BASIC TRANSFORMATIONS --------
    # ----------------------------------------------
    logger.info("Stage 1: read file into Dataframe from S3")
    comments_df1 = sqlContext.read.parquet(filename_read_S3)

    columns = comments_df1.columns
    logger.info("List of columns for Comments - {0}".format(columns))

    logger.info("Stage 2: select required columns from data")
    # NOTE: Column "Downs", 'name' isn't available for all years
    # Its available only from 2006-06
    if 'downs' in columns and 'name' in columns:
        comments_df2 = comments_df1.select('subreddit', 'subreddit_id',
                                           'created_utc', 'author', 'id',
                                           'link_id', 'parent_id', 'body',
                                           'controversiality', 'distinguished',
                                           'gilded', 'score', 'ups', 'downs',
                                           'name')
    else:
        comments_df2 = comments_df1.select('subreddit', 'subreddit_id',
                                           'created_utc', 'author', 'id',
                                           'link_id', 'parent_id', 'body',
                                           'controversiality', 'distinguished',
                                           'gilded', 'score', 'ups')

    logger.info(
        "Stage 3: Removing rows where the post has been deleted"
    )  # TODO: also handle removed posts
    comments_df3 = comments_df2.filter(comments_df2['author'] != '[deleted]')

    # Create and Register trim_link as UDF
    spark.udf.register("trimlinks", trim_link, StringType())
    trim_link_udf = udf(trim_link)

    logger.info("Stage 4: get submission_id from link_id")
    comments_df4 = comments_df3.withColumn("submission_id",
                                           trim_link_udf(col("link_id")))

    # ---------------------------------------------
    # -------- FEATURE ENGINEERING ---------------
    # ---------------------------------------------
    logger.info("Stage 5: Convert 'created_uct' to unix timestamp")
    comments_df5 = comments_df4.select(
        'subreddit', 'subreddit_id', 'author', 'id', 'parent_id', 'body',
        'controversiality', 'distinguished', 'gilded', 'score', 'ups', 'downs',
        'name', 'submission_id',
        from_unixtime('created_utc').alias('timestamp'))

    logger.info(
        "Stage 6: Add new features: Year, Month, day, hour, minute, week, julian day"
    )
    comments_df6 = comments_df5.select(
        'subreddit', 'subreddit_id', 'author', 'id', 'parent_id', 'body',
        'controversiality', 'distinguished', 'gilded', 'score', 'ups', 'downs',
        'name', 'submission_id',
        year(comments_df5.timestamp).alias('year'),
        month(comments_df5.timestamp).alias('month'),
        dayofmonth(comments_df5.timestamp).alias('day'),
        dayofyear(comments_df5.timestamp).alias('day_of_year'),
        hour(comments_df5.timestamp).alias('hour'),
        minute(comments_df5.timestamp).alias('min'),
        weekofyear(comments_df5.timestamp).alias('week_of_year'))

    # ---------------------------------------
    #   PERSIST Data for following reasons:
    # ---------------------------------------
    # 1. Write data to ElasticSearch after ETL
    # 2. Perform NLP based data cleaning for comments
    # 3. Identify popular words
    # 4. Load NLP cleaned data to S3
    # 5. Load Words to ElasticSearch
    comments_df6.persist(StorageLevel.MEMORY_AND_DISK_SER)
    logger.info("persisted data after initial cleaning")

    # -------------------------------------------
    # Write to ElasticSearch: NDJSON file
    # -------------------------------------------
    # Load the Cleaned data to ElasticSearch
    logger.info("starting transforming data to NDJSON - for Large ES load")
    nd_json = comments_df6.rdd.map(lambda x: elastic_search_mapper_body(x))
    logger.info("completed transformation to NDJSON")

    logger.info("save data as Text file")
    if not os.path.exists(filename_write_elastic):
        nd_json.saveAsTextFile(filename_write_elastic)
        ES_WRITE_STATUS = True
    else:
        logger.info("data already loaded")

    # ----------------------------
    # NLP transformations Pipeline
    # -----------------------------
    logger.info("Stage 7: Remove Punctuations")
    comments_df7 = comments_df6.select(
        'subreddit', 'subreddit_id', 'author', 'id', 'parent_id', 'body',
        'controversiality', 'distinguished', 'gilded', 'score', 'ups', 'downs',
        'name', 'submission_id', 'year', 'month', 'day', 'day_of_year', 'hour',
        'min', 'week_of_year', removePunctuation(col('body')))

    logger.info("stage 8: Word Tokenization")
    tokenizer = Tokenizer(inputCol="cleaned_body", outputCol="tokenized_body")
    comments_df8 = tokenizer.transform(comments_df7).select(
        'subreddit', 'subreddit_id', 'author', 'id', 'parent_id', 'body',
        'controversiality', 'distinguished', 'gilded', 'score', 'ups', 'downs',
        'name', 'submission_id', 'year', 'month', 'day', 'day_of_year', 'hour',
        'min', 'week_of_year', 'tokenized_body')

    # StopWords Removal
    logger.info("Stage 9: Using SPARK default stopwords.")
    remover = StopWordsRemover()
    remover.setInputCol("tokenized_body")
    remover.setOutputCol("no_stop_words_body")
    comments_df9 = remover.transform(comments_df8).select(
        'subreddit', 'subreddit_id', 'author', 'id', 'parent_id', 'body',
        'controversiality', 'distinguished', 'gilded', 'score', 'ups', 'downs',
        'name', 'submission_id', 'year', 'month', 'day', 'day_of_year', 'hour',
        'min', 'week_of_year', 'no_stop_words_body')
    logger.info("Stage 10: Making a Custom list of words")
    # TODO: Get Reddit frequent words
    spark.udf.register("filterExtraStopWords", filter_stop_words,
                       ArrayType(StringType()))
    filter_stop_words_udf = udf(filter_stop_words, ArrayType(StringType()))
    comments_df10 = comments_df9.select(
        'subreddit', 'subreddit_id', 'author', 'id', 'parent_id', 'body',
        'controversiality', 'distinguished', 'gilded', 'score', 'ups', 'downs',
        'name', 'submission_id', 'year', 'month', 'day', 'day_of_year', 'hour',
        'min', 'week_of_year',
        filter_stop_words_udf("no_stop_words_body").alias(
            "body_without_stopwords"))

    logger.info(comments_df10.schema.simpleString())
    # -------------------------
    # Upload Cleaned data to S3
    # -------------------------
    comments_df10.write.parquet(filename_write_S3)
    logger.info("completed loading the data to S3")

    return
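spark_transformation_comments relies on several project-level helpers (trim_link, filter_stop_words, removePunctuation, elastic_search_mapper_body) that are defined elsewhere in the source repository. Purely for orientation, a rough, hypothetical sketch of what the first three might look like:

from pyspark.sql.functions import lower, regexp_replace, trim

# Hypothetical stand-ins; the real project defines its own versions.

def trim_link(link_id):
    # Reddit link ids look like "t3_abc123"; drop the type prefix to recover
    # the submission id (assumed behaviour)
    return link_id.split("_", 1)[-1] if link_id else link_id

EXTRA_STOP_WORDS = {"r", "u", "gt", "lt", "amp"}  # illustrative Reddit-specific tokens

def filter_stop_words(tokens):
    # Drop additional high-frequency tokens left over after Spark's default list
    return [t for t in tokens if t not in EXTRA_STOP_WORDS]

def removePunctuation(column):
    # Lower-case, strip punctuation, trim whitespace, and alias the result as
    # 'cleaned_body' so the Tokenizer stage can pick it up
    return trim(lower(regexp_replace(column, r"[^\w\s]", ""))).alias("cleaned_body")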
Example #5
vector_df.show(10)

"""**3. Remove** **stop words**"""

from pyspark.ml.feature import StopWordsRemover

# Define a list of stop words or use default list
remover = StopWordsRemover()
stopwords = remover.getStopWords()

# Display default list
stopwords[:10]

# Specify input/output columns
remover.setInputCol("vector")
remover.setOutputCol("Body_no_stopw")

# Transform existing dataframe with the StopWordsRemover
Body_no_stopw_df = remover.transform(vector_df).select("Body_no_stopw")

# Display
Body_no_stopw_df.printSchema()
Body_no_stopw_df.show()

"""**4. Tokenizing posts into words**"""

# Import stemmer library
from nltk.stem.porter import PorterStemmer

# Instantiate stemmer object
stemmer = PorterStemmer()
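The snippet stops after instantiating the stemmer; a plausible continuation (a sketch, not necessarily the original notebook's code) wraps it in a UDF and applies it to the stop-word-filtered column:

from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

# Wrap the stemmer in a UDF so it can run over each token list
stemmer_udf = udf(lambda tokens: [stemmer.stem(t) for t in tokens],
                  ArrayType(StringType()))

Body_stemmed_df = Body_no_stopw_df.withColumn(
    "Body_stemmed", stemmer_udf("Body_no_stopw")
).select("Body_stemmed")
Body_stemmed_df.show()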
    def import_data(self):

        # meta df
        meta_df = pd.read_csv(self.metadata_path, dtype={
            'pubmed_id': str,
            'Microsoft Academic Paper ID': str,
            'doi': str
        })

        # json
        all_json = glob.glob(f"{self. DEFAULT_INPUT_PATH}/**/*.json", recursive=True)

        dict_ = {'paper_id': [], 'doi': [], 'abstract': [], 'body_text': [], 'authors': [], 'title': [], 'journal': [],
                 'abstract_summary': []}
        for idx, entry in enumerate(all_json):
            # progress message roughly every 10% (guard against fewer than 10 files)
            if len(all_json) >= 10 and idx % (len(all_json) // 10) == 0:
                print(f'Processing index: {idx} of {len(all_json)}')

            try:
                content = FileReader(entry)
            except Exception as e:
                continue  # invalid paper format, skip

            # get metadata information
            meta_data = meta_df.loc[meta_df['sha'] == content.paper_id]
            # no metadata, skip this paper
            if len(meta_data) == 0:
                continue

            dict_['abstract'].append(content.abstract)
            dict_['paper_id'].append(content.paper_id)
            dict_['body_text'].append(content.body_text)

            # also create a column for the summary of abstract to be used in a plot
            if len(content.abstract) == 0:
                # no abstract provided
                dict_['abstract_summary'].append("Not provided.")
            elif len(content.abstract.split(' ')) > 100:
                # abstract provided is too long for plot, take first 100 words append with ...
                info = content.abstract.split(' ')[:100]
                summary = self.get_breaks(' '.join(info), 40)
                dict_['abstract_summary'].append(summary + "...")
            else:
                # abstract is short enough
                summary = self.get_breaks(content.abstract, 40)
                dict_['abstract_summary'].append(summary)

            try:
                # if more than one author
                authors = meta_data['authors'].values[0].split(';')
                if len(authors) > 2:
                    # if more than 2 authors, take them all with html tag breaks in between
                    dict_['authors'].append(self.get_breaks('. '.join(authors), 40))
                else:
                    # authors will fit in plot
                    dict_['authors'].append(". ".join(authors))
            except Exception as e:
                # if only one author - or null value
                dict_['authors'].append(meta_data['authors'].values[0])

            # add the title information, add breaks when needed
            try:
                title = self.get_breaks(meta_data['title'].values[0], 40)
                dict_['title'].append(title)
            # if title was not provided
            except Exception as e:
                dict_['title'].append(meta_data['title'].values[0])

            # add the journal information
            dict_['journal'].append(meta_data['journal'].values[0])

            # add doi
            dict_['doi'].append(meta_data['doi'].values[0])

        df_covid = pd.DataFrame(dict_,
                                columns=['paper_id', 'doi', 'abstract', 'body_text', 'authors', 'title', 'journal',
                                         'abstract_summary'])
        df_covid['abstract_word_count'] = df_covid['abstract'].apply(
            lambda x: len(x.strip().split()))  # word count in abstract
        df_covid['body_word_count'] = df_covid['body_text'].apply(
            lambda x: len(x.strip().split()))  # word count in body
        df_covid['body_unique_words'] = df_covid['body_text'].apply(
            lambda x: len(set(str(x).split())))  # number of unique words in body

        # remove duplicates
        df_covid.drop_duplicates(['abstract', 'body_text'], inplace=True)
        df_covid['abstract'].describe(include='all')
        df_covid.dropna(inplace=True)

        # handle multiple languages
        # set seed
        DetectorFactory.seed = 0

        # hold label - language
        languages = []

        # go through each text
        for ii in tqdm(range(0, len(df_covid))):
            # split by space into a list, take the first N items, join with a space
            text = df_covid.iloc[ii]['body_text'].split(" ")

            lang = "en"
            try:
                if len(text) > 50:
                    lang = detect(" ".join(text[:50]))
                elif len(text) > 0:
                    lang = detect(" ".join(text[:len(text)]))
            # detection failed: the beginning of the document was not in a usable format
            except Exception as e:
                all_words = set(text)
                try:
                    lang = detect(" ".join(all_words))
                # still no luck; see if the abstract gives us any usable text
                except Exception as e:
                    try:
                        # try to label the language through the abstract summary instead
                        lang = detect(df_covid.iloc[ii]['abstract_summary'])
                    except Exception as e:
                        lang = "unknown"

            # get the language
            languages.append(lang)

        languages_dict = {}
        for lang in set(languages):
            languages_dict[lang] = languages.count(lang)

        df_covid['language'] = languages
        # keep only English-language documents
        df_covid = df_covid[df_covid['language'] == 'en']


        # change to spark
        # Enable Arrow-based columnar data transfers
        spark = SparkSession \
            .builder \
            .appName("PySparkKMeans") \
            .config("spark.some.config.option", "some-value") \
            .getOrCreate()
        spark.conf.set("spark.sql.execution.arrow.enabled", "true")

        # Create a Spark DataFrame from a pandas DataFrame using Arrow
        df_english = spark.createDataFrame(df_covid)
        clean_text_df = df_english.withColumn("text", self.clean_text(col("body_text")))

        tokenizer = Tokenizer(inputCol="text", outputCol="vector")
        vector_df = tokenizer.transform(clean_text_df)


        # remove stopwords
        punctuations = string.punctuation
        stopwords = list(STOP_WORDS)
        stopwords[:10]

        custom_stop_words = [
            'doi', 'preprint', 'copyright', 'peer', 'reviewed', 'org', 'https', 'et', 'al', 'author', 'figure',
            'rights', 'reserved', 'permission', 'used', 'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.',
            'al.', 'elsevier', 'pmc', 'czi', 'www', "a", "b", "c", "d", "e", "f", "g", "h", "i", "j", "k", "l", "m",
            "n", "o", "p", "q", "r", "s", "t", "u", "v", "w", "x", "y", "z"
        ]

        for w in custom_stop_words:
            if w not in stopwords:
                stopwords.append(w)

        # Define a list of stop words or use default list
        remover = StopWordsRemover(stopWords=stopwords)

        # Specify input/output columns
        remover.setInputCol("vector")
        remover.setOutputCol("vector_no_stopw")

        # Transform existing dataframe with the StopWordsRemover
        vector_no_stopw_df = remover.transform(vector_df)



        # tf-idf (mllib RDD API): HashingTF/IDF operate on an RDD of token lists
        hashingTF = HashingTF()
        tf = hashingTF.transform(
            vector_no_stopw_df.select("vector_no_stopw").rdd.map(lambda row: row[0]))

        # While applying HashingTF only needs a single pass over the data, applying IDF
        # needs two passes: first to compute the IDF vector and second to scale the
        # term frequencies by IDF.
        tf.cache()
        idf = IDF().fit(tf)
        tfidf = idf.transform(tf)

        # PCA
        mat = RowMatrix(tfidf)
        # Compute the top 1325 principal components.
        # Principal components are stored in a local dense matrix.
        # NOTE: computePrincipalComponents assumes the hashed feature dimension is small
        # enough (at most 65535 columns) to be handled locally.
        pc = mat.computePrincipalComponents(1325)

        # Project the rows onto the linear space spanned by the top principal components.
        projected = mat.multiply(pc)
        # RowMatrix has no toPandas(); collect the projected rows into pandas before writing
        projected_pdf = pd.DataFrame(projected.rows.map(lambda v: v.toArray()).collect())
        projected_pdf.to_csv(f"{self.DEFAULT_OUTPUT_FILE}")

        return projected