Example #1
plt.show()

y1Val.sort()
y2Val.sort()

plt.plot(x, y1Val, 'go--', linewidth=2, markersize=0, label='Non-Spoiler')
plt.plot(x, y2Val, 'ro-', linewidth=2, markersize=0, label='Spoiler')
plt.ylabel('Average Sentence Length')
plt.legend(loc='upper left')
plt.title('Review Sentence Length')
plt.savefig(IMG_PATH + 'lengthSorted.png', format='png', transparent=True)
plt.show()

# %%
stopWords = list(set(nltk.corpus.stopwords.words('english'))) + ['']
tokenizer = Tokenizer(inputCol='review', outputCol='tokens')
stopWordRemover = StopWordsRemover(
    inputCol=tokenizer.getOutputCol(),
    outputCol='stoppedWords').setStopWords(stopWords)
pipeline = Pipeline(stages=[tokenizer, stopWordRemover])
dataSet = pipeline.fit(dataSet).transform(dataSet)

# %%
newLengthDF = dataSet.withColumn('newLength',
                                 F.size(stopWordRemover.getOutputCol()))

# %%
newSentenceLen = newLengthDF.select('class', 'newLength').collect()

#%%
def preprocess_files(bucket_name, file_name):

    raw_data = sql_context.read.parquet("s3a://{0}/{1}".format(
        bucket_name, file_name))

    #LIMIT TO 10 initially
    unanswered_questions = raw_data.filter(raw_data.PostTypeId == 1).filter(
        raw_data.AcceptedAnswerId.isNull())
    print(unanswered_questions.count())

    # Clean article text
    print(colored("[PROCESSING]: Cleaning post body", "green"))
    clean_body = F.udf(lambda body: filter_body(body), StringType())
    clean_article_data = unanswered_questions.withColumn(
        "cleaned_body", clean_body("Body"))
    # Tokenize article text
    print(colored("[PROCESSING]: Tokenizing text vector...", "green"))
    tokenizer = Tokenizer(inputCol="cleaned_body",
                          outputCol="text_body_tokenized")
    tokenized_data = tokenizer.transform(clean_article_data)
    print("tokenized_data")
    # tokenized_data.show()

    # Remove stop words
    print(colored("[PROCESSING]: Removing stop words", "green"))
    stop_words_remover = StopWordsRemover(
        inputCol="text_body_tokenized",
        outputCol="text_body_stop_words_removed")
    stop_words_removed_data = stop_words_remover.transform(tokenized_data)
    print("stop_words_removed_data")
    # stop_words_removed_data.show()

    # Stem words
    print(colored("Stemming tokenized text", "green"))
    stem = F.udf(lambda tokens: lemmatize(tokens), ArrayType(StringType()))
    stemmed_data = stop_words_removed_data.withColumn(
        "text_body_stemmed", stem("text_body_stop_words_removed"))
    print("stemmed_data")
    # stemmed_data.show()

    # Shingle resulting body
    print(colored("Shingling resulting text", "green"))
    shingle = F.udf(lambda tokens: get_n_gram_shingles(tokens, 3),
                    StringType())
    shingled_data = stemmed_data.withColumn("text_body_shingled",
                                            shingle("text_body_stemmed"))
    shingle_table = shingled_data.select('Id', 'text_body_shingled')
    print(colored("Adding category/id mappings to Redis", "green"))
    print("shingle_table")
    shingle_table.head()

    # Create a mapping of article categories to article ids. Each key is an
    # article category and the value is the list of article ids under that category.
    cat_id_map = unanswered_questions.select(
        F.explode('Tags').alias('Tag'), 'Id').groupBy(F.col('Tag')).agg(
            F.collect_list('Id').alias('Ids_list')).where(
                F.size(F.col('Ids_list')) < 200).withColumn(
                    'Ids', to_str_udf('Ids_list'))
    print(colored("Beginning writing category/id mapping to Redis", "green"))

    def write_cat_id_map_to_redis(rdd):
        rdb = redis.StrictRedis(
            host="ec2-52-73-233-196.compute-1.amazonaws.com", port=6379, db=1)
        for row in rdd:
            rdb.sadd('cat:{}'.format(row.Tag), row.Ids)

    cat_id_map.foreachPartition(write_cat_id_map_to_redis)
    cat_id_map.show(5, True)
    print(colored("Finished writing category/id mapping to Redis", "green"))

    #Minhash calculations
    k = 100
    random_seed = 50
    masks = (np.random.RandomState(seed=random_seed).randint(
        np.iinfo(np.int64).min,
        np.iinfo(np.int64).max, k))

    def update_min_hash_signature(word, min_hash_signature):
        root_hash = mmh3.hash64(pickle.dumps(word))[0]
        word_hashes = np.bitwise_xor(
            masks, root_hash
        )  # XOR root hash with k randomly generated integers to simulate k hash functions
        min_hash_signature = np.minimum(min_hash_signature, word_hashes)
        return min_hash_signature

    def calc_min_hash_signature(tokens):
        min_hash_signature = np.empty(k, dtype=np.int64)
        min_hash_signature.fill(np.iinfo(np.int64).max)
        for token in tokens:
            min_hash_signature = update_min_hash_signature(
                token, min_hash_signature)
        return min_hash_signature

    def compute_minhash(df):
        calc_min_hash_udf = F.udf(
            lambda x: str(
                list(map(lambda x: int(x), calc_min_hash_signature(x)))),
            StringType())
        df = df.withColumn("min_hash",
                           calc_min_hash_udf("text_body_shingled")).select(
                               'id', 'min_hash')
        return df

    print(colored("Computing minhash values", "green"))
    minhash_df = compute_minhash(shingle_table)
    print(colored("Finished computing minhash values", "green"))
    print(colored("Beginning writing minhash data to Redis", "green"))

    # Write minhash data to redis. If pipeline=True, use pipeline
    # method of inserting data in Redis
    def write_minhash_data_to_redis(rdd):
        rdb = redis.StrictRedis(
            host="ec2-52-73-233-196.compute-1.amazonaws.com", port=6379, db=1)
        for row in rdd:
            rdb.sadd('id:{}'.format(row.id), row.min_hash)
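
    # The comment above mentions a pipeline=True mode for inserting data into Redis;
    # that variant is not shown in the snippet. A sketch of what it could look like
    # (an assumption, not the original code), batching the SADD calls through a
    # Redis pipeline to reduce round trips:
    def write_minhash_data_to_redis_pipelined(rdd):
        rdb = redis.StrictRedis(
            host="ec2-52-73-233-196.compute-1.amazonaws.com", port=6379, db=1)
        pipe = rdb.pipeline()
        for row in rdd:
            pipe.sadd('id:{}'.format(row.id), row.min_hash)
        pipe.execute()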

    #print(minhash_df.show(5, True))
    minhash_df.foreachPartition(write_minhash_data_to_redis)

    print(colored("Finished writing minhash data to Redis", "green"))
    old_df = new_df
    p1.close()
	
for f2 in listdir(hampath):
    p2 = open(hampath+f2, 'r')  # create a temporary dataframe and append the label 0.0 to every ham email
    temp2 = spark.createDataFrame([(p2.read(), 0.0)], ['text', 'label'])
    new_df = old_df.unionAll(temp2)
    old_df = new_df
    p2.close()

final = old_df

(training, test) = final.randomSplit([0.8, 0.2]) 	#split the dataframe into training and test

# Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
tokenizer = Tokenizer(inputCol="text", outputCol="words")   	#now create a tokenizer for getting word of the email
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")		#create a feature for every word
lr = LogisticRegression(maxIter=10, regParam=0.001)		#make a logistic regression model
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

# Fit the pipeline to training documents.
model = pipeline.fit(training)					#fit the model based on training data

# Make predictions on test documents and print columns of interest.
prediction = model.transform(test)				#make predictions on test data based on model

prediction.show(1)						#show the columns of interest

#Accuracy calculation

evaluator1 = MulticlassClassificationEvaluator(
Example #4
mySchema = StructType([ StructField("target", IntegerType(), True)\
                       ,StructField("id", StringType(), True)\
                       ,StructField("date", StringType(), True)\
                       ,StructField("flag", StringType(), True)\
                       ,StructField("user", StringType(), True)\
                       ,StructField("body", StringType(), True)])

df = spark.createDataFrame(data, schema=mySchema)
df.show(5)

# Create training, validation, and test sets
(train_set, val_set, test_set) = df.randomSplit([0.98, 0.01, 0.01], seed=2000)

# Prepare TF-IDF + Logistic Regression Model
tokenizer = Tokenizer(inputCol="body", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features",
          minDocFreq=5)  #minDocFreq: remove sparse terms
label_stringIdx = StringIndexer(inputCol="target", outputCol="label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf, label_stringIdx])

pipelineFit = pipeline.fit(train_set)
train_df = pipelineFit.transform(train_set)
val_df = pipelineFit.transform(val_set)
train_df.show(5)

# Train Model
lr = LogisticRegression(maxIter=20)
lrModel = lr.fit(train_df)
predictions = lrModel.transform(val_df)
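
# The snippet ends at the validation predictions; a minimal follow-up sketch (not part
# of the original example) for scoring them with Spark's binary classification evaluator:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

evaluator = BinaryClassificationEvaluator(rawPredictionCol="rawPrediction", labelCol="label")
print("Validation ROC AUC: {:.4f}".format(evaluator.evaluate(predictions)))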
Example #5
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName('nlp').getOrCreate()

from pyspark.ml.feature import HashingTF, IDF, Tokenizer

sentenceData = spark.createDataFrame(
    [(0.0, "Hi I heard about Spark"),
     (0.0, "I wish Java could use case classes"),
     (1.0, "Logistic regression models are neat")], ["label", "sentence"])

sentenceData.show()

tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
words_data = tokenizer.transform(sentenceData)
words_data.show(truncate=False)

hashing_tf = HashingTF(inputCol='words', outputCol='rawFeatures')
featurized_data = hashing_tf.transform(words_data)
idf = IDF(inputCol='rawFeatures', outputCol='features')
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)
rescaled_data.select('label', 'features').show(truncate=False)

from pyspark.ml.feature import CountVectorizer

df = spark.createDataFrame([(0, "a,b,c".split(" ")),
                            (1, "a b b c a".split(" "))], ["id", "words"])

df.show()
Example #6
def tokenizer(dataset, inputCol):
    from pyspark.ml.feature import Tokenizer
    return Tokenizer(inputCol=inputCol, outputCol=inputCol+'_tkn').transform(dataset)
sc = SparkContext(conf = conf)
spark = SparkSession(sc)
sqlContext = SQLContext(sc)  
df = spark.read.csv('file:////home/ubuntu/ys-180326/Dataset150.csv', header = True)
data = df.rdd.map(list)
print(data.first())

score = data.map(lambda s : 1.0 if s[1].isdigit() and float(s[1])==1.0 else 0.0 )
comment = data.map(lambda s : s[3])
split_neg_data2 = score.zip(comment)
transform_data = split_neg_data2.map(lambda p: (p[0], p[1]))  #.toDF()#.withColumnRenamed('_1','label')
#transform_data.show()
#sentenceData = spark.createDataFrame([(0, "I heard about Spark and I love Spark"),(0, "I wish Java could use case classes"),(1, "Logistic regression models are neat")]).toDF("label", "sentence")

sentenceData = spark.createDataFrame(transform_data, ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# Compute TF-IDF
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=3000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.select("label", "features").show()
forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit(rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8,0.2],seed=0)
print(trainingData.take(1))

nb = NaiveBayes(smoothing=1.0, modelType="multinomial",labelCol="indexed")
start_time = time.time()
Example #8
spark = SparkSession.builder.appName("TfIdf Example").getOrCreate()

documentData = spark.createDataFrame([
        (0.0, Doc1),
        (0.1, Doc2),
        (0.2, Doc3),
        (0.3, Doc4),
        (0.5, Doc5)
    ], ["label", "document"])

#Printing the data
documentData.show()

#Performing Tokenization for the data
tokenizer = Tokenizer(inputCol="document", outputCol="words")
wordsData = tokenizer.transform(documentData)
wordsData.show()

"""# **2.a) Performing the task without NLP**"""

# applying tf on the words data
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=200)
tf = hashingTF.transform(wordsData)
# alternatively, CountVectorizer can also be used to get term frequency vectors
# calculating the IDF
tf.cache()
idf = IDF(inputCol="rawFeatures", outputCol="features")
idf = idf.fit(tf)
tfidf = idf.transform(tf)
#displaying the results
tfidf.select("label", "features").show(truncate=False)
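
# As noted above, CountVectorizer is an alternative to HashingTF for building term
# frequency vectors. A minimal sketch (not part of the original example; it reuses the
# wordsData dataframe from above):
from pyspark.ml.feature import CountVectorizer

cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize=200)
cv_model = cv.fit(wordsData)
tf_cv = cv_model.transform(wordsData)
tf_cv.select("label", "rawFeatures").show(truncate=False)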
Example #9
                             F.to_date('PublishDate', "yyyy-MM-dd HH:mm:ss"))
df_timestamped = df_news.select(['PublishDate1', 'Topic', 'Title', 'Headline'])

# drop duplicates
#df_timestamped = df_timestamped.dropDuplicates(['Title', 'Headline'])

# remove punctuation from text data, text to lower case and trim whitespaces
df_timestamped = df_timestamped.withColumn(
    'Title',
    F.trim(F.lower(F.regexp_replace(F.col('Title'), r'[^\sa-zA-Z0-9]', ''))))
df_timestamped = df_timestamped.withColumn(
    'Headline',
    F.trim(F.lower(F.regexp_replace(F.col('Headline'), r'[^\sa-zA-Z0-9]', ''))))

# tokenize titles and headlines
title_tokenizer = Tokenizer(inputCol='Title', outputCol='Title_words')
headline_tokenizer = Tokenizer(inputCol='Headline', outputCol='Headline_words')
df_timestamped = title_tokenizer.transform(df_timestamped)
df_timestamped = headline_tokenizer.transform(df_timestamped)

# remove stop words
title_remover = StopWordsRemover(inputCol='Title_words',
                                 outputCol='Title_final')
headline_remover = StopWordsRemover(inputCol='Headline_words',
                                    outputCol='Headline_final')
df_timestamped = title_remover.transform(df_timestamped)
df_timestamped = headline_remover.transform(df_timestamped)

# simplify dataframe
df_timestamped = df_timestamped.select(
    F.col('PublishDate1').alias('PublishDate'), F.col('Topic'),
Example #10
def main(input_dir, output_dir):
    # main logic starts here
    df_schema = types.StructType([
        types.StructField('title_clean', types.StringType()),
        types.StructField('polarity_subjectivity',
                          types.ArrayType(types.FloatType())),
        types.StructField('score', types.LongType()),
        types.StructField('num_comments', types.LongType()),
    ])

    headlines_df = spark.read.json(input_dir,
                                   encoding='utf-8',
                                   schema=df_schema).repartition(80)
    split_sentiment_df = headlines_df.withColumn(
        'polarity',
        functions.element_at(headlines_df['polarity_subjectivity'],
                             1)).withColumn(
                                 'subjectivity',
                                 functions.element_at(
                                     headlines_df['polarity_subjectivity'], 2))

    df_sentiment = split_sentiment_df.withColumn(
        'label', get_label(split_sentiment_df['polarity']))

    training_set, validation_set = df_sentiment.randomSplit([0.75, 0.25])

    headline_vector_size = 3
    word_freq_vector_size = 100

    tokenizer = Tokenizer(inputCol='title_clean', outputCol='words')
    headline2Vector = Word2Vec(vectorSize=headline_vector_size,
                               minCount=0,
                               inputCol='words',
                               outputCol='headline_vector')
    hashingTF = HashingTF(inputCol='words',
                          outputCol='word_counts',
                          numFeatures=word_freq_vector_size)
    idf = IDF(inputCol='word_counts', outputCol='word_frequecy', minDocFreq=5)
    headline_vector_size_hint = VectorSizeHint(
        inputCol='headline_vector',
        size=headline_vector_size)  #need this for streaming
    word_freq_vector_size_hint = VectorSizeHint(
        inputCol='word_frequecy',
        size=word_freq_vector_size)  #need this for streaming
    feature_assembler = VectorAssembler(inputCols=[
        'headline_vector', 'score', 'num_comments', 'subjectivity',
        'word_frequecy'
    ],
                                        outputCol='features')
    dt_classifier = DecisionTreeClassifier(featuresCol='features',
                                           labelCol='label',
                                           predictionCol='prediction',
                                           maxDepth=9)

    pipeline = Pipeline(stages=[
        tokenizer, headline2Vector, hashingTF, idf, headline_vector_size_hint,
        word_freq_vector_size_hint, feature_assembler, dt_classifier
    ])
    sentiment_model = pipeline.fit(training_set)

    validation_predictions = sentiment_model.transform(validation_set)

    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  labelCol='label')
    validation_score = evaluator.evaluate(validation_predictions)
    print('Validation score for Sentiment model F1: %g' % (validation_score, ))

    validation_score_accuracy = evaluator.evaluate(
        validation_predictions, {evaluator.metricName: "accuracy"})
    print('Validation score for Sentiment model Accuracy: %g' %
          (validation_score_accuracy, ))

    sentiment_model.write().overwrite().save(output_dir)
Example #11
def base_features_gen_pipeline(input_descript_col="descript", input_category_col="category", output_feature_col="features", output_label_col="label"):
    tok = Tokenizer(inputCol=input_descript_col, outputCol="items")
    cv = CountVectorizer(inputCol="items", outputCol=output_feature_col)
    i = StringIndexer(inputCol=input_category_col, outputCol=output_label_col)
    return Pipeline(stages=[tok, cv, i])
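
# A small, self-contained usage sketch of the helper above. The spark session and the
# toy dataframe are illustrative assumptions, not part of the original snippet; the
# pyspark.ml imports used inside the helper are assumed to be in scope.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("base-features-demo").getOrCreate()
demo_df = spark.createDataFrame(
    [("cheap flights to paris", "travel"),
     ("new gpu benchmark results", "tech")],
    ["descript", "category"])
demo_model = base_features_gen_pipeline().fit(demo_df)
demo_model.transform(demo_df).select("features", "label").show(truncate=False)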
Example #12
    'hdfs://192.168.50.93:9000/user/hadoop/books2/amazon_reviews_us_Wireless_v1_00.tsv'
)
df0.printSchema()

#FILTERING FOR EMPTY VALUES
df01 = df0.filter((col("review_body").isNotNull())
                  & (col("verified_purchase").isNotNull()))

#ENCODING LABEL
stage_string = StringIndexer(inputCol="verified_purchase",
                             outputCol="class_res")
ppl = Pipeline(stages=[stage_string])
df1 = ppl.fit(df01).transform(df01)

#CREATING TF_IDF
tokenizer = Tokenizer(inputCol="review_body", outputCol="words")
wordsData = tokenizer.transform(df1)
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

#NAIVEBAYES
nb = NaiveBayes(featuresCol="features", labelCol="class_res")

#Model training
model = nb.fit(rescaledData)
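
# A typical follow-up (not shown in the snippet): apply the trained model and inspect a
# few predictions; in practice this would be done on a held-out split rather than on the
# training data itself.
predictions = model.transform(rescaledData)
predictions.select("class_res", "prediction", "probability").show(5, truncate=False)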
Example #13
timestart = datetime.datetime.now()

print("abstracts_full_df2.head() = {}".format(abstracts_full_df2.head()))

# Convert the content to Lower Case
print("Converting the abstarct to Lower Case ... ")
abstracts_full_df3 = abstracts_full_df2.withColumn("abstractNew", lower(col("abstract"))).\
    withColumn("abstractNew", regexp_replace("abstractNew", '[^\w-_ ]', ""))

abstracts_full_df3.printSchema()
# print("abstracts_full_df3.head() = {}".format(abstracts_full_df3.head()))

# Tokenize the Abstracts
print("tokenizating the abstracts... ")
tokenizer = Tokenizer(inputCol="abstractNew", outputCol="words")
remover = StopWordsRemover(inputCol="words", outputCol="filtWords")

abstracts_full_df4 = tokenizer.transform(abstracts_full_df3)

print("After tokenization: ")
abstracts_full_df4.printSchema()
print("abstracts_full_df4.count() = {}".format(abstracts_full_df4.count()))
# print("abstracts_full_df4.head() = {}".format(abstracts_full_df4.head()))

# PRINT HOW MUCH TIME IT TOOK TO RUN THE CELL
timeend = datetime.datetime.now()
timedelta = round((timeend - timestart).total_seconds() / 60, 2)
print("Time taken to execute above cell: " + str(timedelta) + " mins")

# ### Step 5
Example #14
# Import the necessary functions
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Remove punctuation (REGEX provided) and numbers
wrangled = sms.withColumn('text',
                          regexp_replace(sms.text, '[_():;,.!?\\-]', ' '))
wrangled = wrangled.withColumn('text',
                               regexp_replace(wrangled.text, '[0-9]', ' '))

# Merge multiple spaces
wrangled = wrangled.withColumn('text', regexp_replace(wrangled.text, ' +',
                                                      ' '))

# Split the text into words
wrangled = Tokenizer(inputCol='text', outputCol='words').transform(wrangled)

wrangled.show(4, truncate=False)

#########################################################################################

#Stop words and hashing

from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Remove stop words.
wrangled = StopWordsRemover(inputCol='words', outputCol='terms')\
      .transform(sms)

# Apply the hashing trick
wrangled = HashingTF(inputCol='terms', outputCol='hash', numFeatures=1024)\
Example #15
# Import library
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.sql import SparkSession, Window
import pyspark.sql.functions as F

# Create a spark session
spark = SparkSession.builder.appName("TfIdf-tokenizing").getOrCreate()

# Load and read input
documents = spark.read.text("dataset/*.txt")
documents = documents.withColumn("doc_id", F.row_number().over(Window.orderBy('value')))
print('The schema associated to the input is')
documents.printSchema()

# Tokenization of input (splitting the text into individual token/word)
tokenizer = Tokenizer(inputCol="value", outputCol="words")
wordsData = tokenizer.transform(documents)

# Computation of Term-Frequency (TF) associated with the data
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)

# Computation of Inverse Document Frequency (IDF) associated with the data
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Identifying the TF_IDF associated with the data
rescaledData.select("doc_id", "features").show(truncate=False)

# Close spark session
def divideAmount(totalAmount, totalSupporter):
    if(totalSupporter is None or totalAmount is None):
        return 0
    if (totalSupporter == 0 or totalAmount == 0):
        return 0
    price = int(totalAmount / totalSupporter)
    return price
    
price_udf = udf(dividePrice, IntegerType())
amount_udf = udf(divideAmount, IntegerType())
spark_df = spark_df.withColumn('rangeAmount', price_udf(spark_df['totalAmount'], spark_df['totalSupporter']))
spark_df = spark_df.withColumn('amount', amount_udf(spark_df['totalAmount'], spark_df['totalSupporter']))
spark_df = spark_df.withColumn('soop', spark_df['soop'].cast('string'))
spark_df.show()

tokenizer = Tokenizer(inputCol='soop', outputCol='keywords')
wordData = tokenizer.transform(spark_df)
word2Vec = Word2Vec(vectorSize=100, minCount=5, inputCol='keywords', outputCol='word_vec', seed=123)
word2VecData = word2Vec.fit(wordData)
word2VecData = word2VecData.transform(wordData)

# Add rangeAmount comparison value
all_wadiz_vecs = word2VecData.select('id','word_vec','rangeAmount')


import pyspark.sql.functions as F
from pyspark.sql.functions import col
from pyspark.ml.feature import Normalizer
from pyspark.sql.column import Column, _to_java_column, _to_seq 

def cosinesimilarity_udf(a, b): 
Example #17
                    outputCol="desc_vec")
w2v_model = word2Vec.fit(w2v_df)

train_df = w2v_model.transform(w2v_df)
train_df.display()

# COMMAND ----------

# MAGIC %md ###Step 2: Feature Transformation
# MAGIC Use Tokenizer to tokenize text into individual terms.

# COMMAND ----------

from pyspark.ml.feature import Tokenizer

tokenizer = Tokenizer(inputCol="description", outputCol="desc_terms")

tokenized_df = tokenizer.transform(preproc_data)
tokenized_df.select("description", "desc_terms").display()

# COMMAND ----------

# MAGIC %md **StopWordsRemover** for removing very commonly occurring words which do not carry much meaning.

# COMMAND ----------

from pyspark.ml.feature import StopWordsRemover

stops_remover = StopWordsRemover(inputCol="desc_terms",
                                 outputCol="desc_nostops")
stops_df = stops_remover.transform(tokenized_df)
Example #18
from operator import add
from pyspark.sql import functions as F
from pyspark.ml import Pipeline
from pyspark.sql import SparkSession
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.feature import Tokenizer, Word2Vec

spark = SparkSession.builder.appName("DataFrame").getOrCreate()

sc = spark.sparkContext

sc.setLogLevel("ERROR")

df = spark.read.json(
    '/home/khalid/exercise_7_for_first_term_students/tweets.json')

tokenizer = Tokenizer(inputCol="content", outputCol="words")
wordsData = tokenizer.transform(df)

new_words = wordsData.select('words')  # Taken from previous exercise

word2Vec = Word2Vec(vectorSize=3,
                    minCount=0,
                    inputCol="words",
                    outputCol="result")
model = word2Vec.fit(new_words)

result = model.transform(new_words)

result.select('result').show()  # Show the resulting word vectors
Example #19
    #training set
    text_positive = sc.textFile("data/training_positif_clean.csv")
    text_negative = sc.textFile("data/training_negatif_clean.csv")

    pos_labels = text_positive.map(lambda x: 1.0).zip(
        text_positive.map(lambda x: x))
    neg_labels = text_negative.map(lambda x: 0.0).zip(
        text_negative.map(lambda x: x))

    pos_df = pos_labels.toDF(["label", "sentence"])
    neg_df = neg_labels.toDF(["label", "sentence"])

    text_df = pos_df.union(neg_df)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(text_df)

    #number of words
    nb_features = 10000
    print("\nDone : Tokenization training set")

    ###########################################################################
    #########             TF IDF Training Set                #########

    #training set
    hashingTF = HashingTF(inputCol="words",
                          outputCol="rawFeatures",
                          numFeatures=nb_features)
    featurizedData = hashingTF.transform(wordsData)
smsDf = sqlContext.createDataFrame(smsXformed, ["label", "message"])
smsDf.cache()
smsDf.select("label", "message").show()

#Split training and testing
(trainingData, testData) = smsDf.randomSplit([0.9, 0.1])
trainingData.count()
testData.count()
testData.collect()

#Setup pipeline
from pyspark.ml.classification import NaiveBayes, NaiveBayesModel
from pyspark.ml import Pipeline
from pyspark.ml.feature import HashingTF, Tokenizer
from pyspark.ml.feature import IDF

tokenizer = Tokenizer(inputCol="message", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), \
        outputCol="tempfeatures")
idf = IDF(inputCol=hashingTF.getOutputCol(), outputCol="features")
nbClassifier = NaiveBayes()

pipeline = Pipeline(stages=[tokenizer, hashingTF, \
                idf, nbClassifier])

nbModel = pipeline.fit(trainingData)

prediction = nbModel.transform(testData)
prediction.groupBy("label", "prediction").count().show()
def countIntersection(avec, bvec):
    count = 0
    for x in avec:
        if x in bvec:
            count += 1
    return count


countIntersection_udf = udf(lambda x, y: countIntersection(x, y),
                            IntegerType())

combined_list = nameList.select("pid", concat("name").alias("text"))

cleaned_list = combined_list.select("pid",
                                    clean_text(col("text")).alias("text"))
tokenizer = Tokenizer(inputCol="text", outputCol="vector")
vec_text = tokenizer.transform(cleaned_list).select("pid", "vector")
stemmed_text = vec_text.withColumn("text_stemmed",
                                   stemmer_udf("vector")).select(
                                       "pid", "text_stemmed")

callengeDF = spark.read.json(hdfs_challengeDir, multiLine=True)
challengePlaylist = callengeDF.select(explode("playlists").alias("playlists"))
challengeNameList = challengePlaylist.select("playlists.pid", "playlists.name")

challenge_combined_list = challengeNameList.select(
    "pid",
    concat("name").alias("text"))
challenge_cleaned_list = challenge_combined_list.select(
    "pid",
    clean_text(col("text")).alias("text")).filter("text is not null")
Example #22
    else:
        return False


############################## SPARK ML PIPELINE + CALCULATING ACCURACY ################################
after_process = sqlContext.read.parquet(
    "hdfs:///user/prado/little_text_process1.parquet")
mlinterest = after_process.na.drop(subset=["sentiment"])
sent_value = udf(sentiment_values, IntegerType())
MLinterest = mlinterest.withColumn('label', sent_value(mlinterest.sentiment))
MLINTEREST = MLinterest.filter(MLinterest.lang == "en")
MLINTEREST1 = MLINTEREST.withColumn("label",
                                    MLINTEREST.label.cast(DoubleType()))

# Create pipeline
tokenizer = Tokenizer(inputCol="main", outputCol="words")
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
nb = NaiveBayes()
pipeline = Pipeline(stages=[tokenizer, hashingTF, idf, nb])

# Separate train/test
train, test = MLINTEREST1.randomSplit([0.6, 0.4], 24)
train.cache()

# Train our model
model = pipeline.fit(train)

predictionAndLabels = model.transform(
    test.withColumnRenamed('label', 'true_label'))
wesh = predictionAndLabels.select('prediction', 'true_label').rdd
Example #23
def main():
    spark = SparkSession.builder \
        .appName("Spark CV-job ad matching") \
        .config("spark.some.config.option", "some-value") \
        .master("local[*]") \
        .getOrCreate()

    NUM_FEATURES = 2**8

    df_jobs = spark.read.json("alljobs4rdd/alljobs.jsonl").filter("description is not NULL").cache()
    df_jobs.registerTempTable("jobs")
    df_cvs = spark.read.json("allcvs4rdd/allcvs.jsonl").cache()
    df_cvs.registerTempTable("cvs")
    df_categories = spark.read.json("allcategories4rdd/allcategories.jsonl").cache()
    df_categories.registerTempTable("categories")

    joined = spark.sql("SELECT description AS text, jobId AS id, 'job' AS type FROM jobs UNION ALL \
               SELECT description AS text, cvid AS id, 'cv' AS type FROM cvs UNION ALL \
               SELECT skillText AS text, id AS id, 'categories' AS type FROM categories")

    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tokenized = tokenizer.transform(joined)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    removed = remover.transform(tokenized)

    hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=NUM_FEATURES)
    featurizedData = hashingTF.transform(removed)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    rescaledData.registerTempTable("resultTable")
    jobs = spark.sql("SELECT features, id AS jobId FROM resultTable WHERE type = 'job'")
    cvs = spark.sql("SELECT features AS featuresCV, id AS cvid FROM resultTable WHERE type = 'cv'")
    categories = spark.sql("SELECT features AS featuresCAT, cat.id, cat.skillName AS skillName, category FROM resultTable AS rt\
    LEFT JOIN categories AS cat ON rt.id = cat.id WHERE type = 'categories'")

    #Calculate job-cv similarity START
    crossJoined = jobs.select("jobId", "features").crossJoin(cvs.select("cvid", "featuresCV"))
    calculatedDF = crossJoined.rdd.map(lambda x: (x.jobId, x.cvid, calculate_distance(x.features, x.featuresCV)))\
    .toDF(["jobid", "cvid", "distance"])
    ordered = calculatedDF.orderBy(asc("jobid")).coalesce(2)
    ordered.write.csv('Calculated/tfidf/job-cv')
    #Calculate job-cv similarity END

    #Calculate cv-category similarity START
    crossJoined_cat_cv = cvs.select("cvid", "featuresCV").crossJoin(categories.select("id", "skillName", "featuresCAT", "category"))
    calculatedDF_cat_cv = crossJoined_cat_cv.rdd\
    .map(lambda x: (x.cvid, x.id, x.skillName, x.category, calculate_distance(x.featuresCV, x.featuresCAT)))\
    .toDF(["cvid", "catid", "skillName", "category", "distance"])
    ordered_cat_cv = calculatedDF_cat_cv.orderBy(asc("cvid"), asc("distance")).coalesce(2)
    ordered_cat_cv.write.csv('Calculated/tfidf/cv-category')
    #Calculate cv-category similarity END

    #Job-category START
    crossJoined_job_cat = jobs.select("jobId", "features").crossJoin(categories.select("id", "skillName", "featuresCAT", "category"))
    calculatedDF_job_cat = crossJoined_job_cat.rdd\
    .map(lambda x: (x.jobId, x.id, x.skillName, x.category, calculate_distance(x.features, x.featuresCAT)))\
    .toDF(["jobid", "catid", "skillName", "category", "distance"])
    ordered_job_cat = calculatedDF_job_cat.orderBy( asc("distance")).coalesce(2)
    ordered_job_cat.write.csv('Calculated/tfidf/job-category')
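
# calculate_distance is used above but not shown in the snippet; a plausible placeholder
# (an assumption, not the original implementation) that treats it as the cosine distance
# between two pyspark.ml feature vectors:
import numpy as np

def calculate_distance(v1, v2):
    a, b = v1.toArray(), v2.toArray()
    denom = float(np.linalg.norm(a) * np.linalg.norm(b))
    return 1.0 if denom == 0.0 else 1.0 - float(np.dot(a, b)) / denom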
Example #24
plt.tight_layout(pad=0)
plt.show()

# In[102]:

ham_words = ' '.join(list(df[df['label'] == 0]['message']))
ham_wc = WordCloud(width=512, height=512).generate(ham_words)
plt.figure(figsize=(10, 8), facecolor='k')
plt.imshow(ham_wc)
plt.axis('off')
plt.tight_layout(pad=0)
plt.show()

# In[44]:

tokenizer = Tokenizer(inputCol="message", outputCol="tokenized")

# In[45]:

hasher = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="frequency")

# In[46]:

idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# In[47]:

from pyspark.ml.classification import RandomForestClassifier

# In[48]:
Example #25
# Construct a pipeline
pipeline = Pipeline(stages=[indexer, onehot, assembler, regression])

# Train the pipeline on the training data
pipeline = pipeline.fit(flights_train)

# Make predictions on the testing data
predictions = pipeline.transform(flights_test)

--------------------------------------------------
# Exercise_3 
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF

# Split the text into tokens (Tokenizer splits on whitespace)
tokenizer = Tokenizer(inputCol='text', outputCol='words')

# Remove stop words
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='terms')

# Apply the hashing trick and transform to TF-IDF
hasher = HashingTF(inputCol=remover.getOutputCol(), outputCol="hash")
idf = IDF(inputCol=hasher.getOutputCol(), outputCol="features")

# Create a logistic regression object and add everything to a pipeline
logistic = LogisticRegression()
pipeline = Pipeline(stages=[tokenizer, remover, hasher, idf, logistic])

--------------------------------------------------
# Exercise_4 
# Create an empty parameter grid
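
# The snippet cuts off here; a minimal sketch of the empty grid the comment describes,
# using pyspark.ml.tuning (assumed to match what the exercise builds next):
from pyspark.ml.tuning import ParamGridBuilder

params = ParamGridBuilder().build()
print('Number of models to be tested:', len(params))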
Example #26
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, CountVectorizer, StopWordsRemover
from pyspark.ml.clustering import KMeans
#Check if all the params were passed
if (len(sys.argv) > 5):
    #Setup the sparkContext
    sc = SparkContext(appName="SparkClustering-emonto15-dperezg1")
    spark = SparkSession(sc)
    #Read from hdfs and save using a schema (path,text)
    files = sc.wholeTextFiles("hdfs://" + sys.argv[1])
    schema = StructType([
        StructField("path", StringType(), True),
        StructField("text", StringType(), True)
    ])
    df = spark.createDataFrame(files, schema)
    #Divide the text into an array of words
    tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
    #Load the default stop words for the language passed as the fourth argument
    language_stop_words = StopWordsRemover.loadDefaultStopWords(sys.argv[4])
    #Read the tokens column (the tokenizer output) and produce a new array of words with the stop words removed
    stopWords = StopWordsRemover(inputCol="tokens",
                                 outputCol="stopWordsRemovedTokens",
                                 stopWords=language_stop_words)
    #Hash each word and count its frequency per document, keeping only numFeatures hash buckets
    hashingTF = HashingTF(inputCol="stopWordsRemovedTokens",
                          outputCol="rawFeatures",
                          numFeatures=int(sys.argv[3]))
    #Calculate the inverse document frequency; terms appearing in fewer than minDocFreq documents are ignored
    idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=1)
    #Initialize the kmeans with a specific K
    kmeans = KMeans(k=int(sys.argv[2]))
    #Declare the assembly line (the map of transformations) used to transform the dataset
Example #27
def main():
    spark = SparkSession.builder.appName('AmazonReviewsSparkProcessor').getOrCreate()
    
    # Convert command line args into a map of args
    args_iter = iter(sys.argv[1:])
    args = dict(zip(args_iter, args_iter))
    
    # Retrieve the args and replace 's3://' with 's3a://' (used by Spark)
    s3_input_data = args['s3_input_data'].replace('s3://', 's3a://')
    print(s3_input_data)
    
    s3_output_data = args['s3_output_data'].replace('s3://', 's3a://')
    print(s3_output_data)
    
    schema = StructType([
        StructField('is_positive_sentiment', IntegerType(), True),
        StructField('marketplace', StringType(), True),
        StructField('customer_id', StringType(), True),
        StructField('review_id', StringType(), True),
        StructField('product_id', StringType(), True),
        StructField('product_parent', StringType(), True),
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True),
        StructField('helpful_votes', IntegerType(), True),
        StructField('total_votes', IntegerType(), True),
        StructField('vine', StringType(), True),
        StructField('verified_purchase', StringType(), True),
        StructField('review_headline', StringType(), True),
        StructField('review_body', StringType(), True),
        StructField('review_date', StringType(), True)
    ])
    
    df_csv = spark.read.csv(path=s3_input_data,
                            schema=schema,
                            header=True,
                            quote=None)
    df_csv.show()

    # This dataset should already be clean, but always good to double-check
    print('Showing null review_body rows...')
    df_csv.where(col('review_body').isNull()).show()

    df_csv_cleaned = df_csv.na.drop(subset=['review_body'])
    df_csv_cleaned.where(col('review_body').isNull()).show()
   
    tokenizer = Tokenizer(inputCol='review_body', outputCol='words')
    wordsData = tokenizer.transform(df_csv_cleaned)
    
    hashingTF = HashingTF(inputCol='words', outputCol='raw_features', numFeatures=1000)
    featurizedData = hashingTF.transform(wordsData)
    
    # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
    # 1) compute the IDF vector 
    # 2) scale the term frequencies by IDF
    # Therefore, we cache the result of the HashingTF transformation above to speed up the 2nd pass
    featurizedData.cache()

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idf = IDF(inputCol='raw_features', outputCol='features') #, minDocFreq=2)
    idfModel = idf.fit(featurizedData)
    features_df = idfModel.transform(featurizedData)
    features_df.select('is_positive_sentiment', 'features').show()

    # TODO:  Use SVD instead
    # features_vector_rdd = features_df.select('features').rdd.map( lambda row: Vectors.fromML(row.getAs[MLVector]('features') )
    # features_vector_rdd.cache()
    # mat = RowMatrix(features_vector_rdd)
    # k = 300
    # svd = mat.computeSVD(k, computeU=True)
    # TODO:  Reconstruct

    num_features=300
    pca = PCA(k=num_features, inputCol='features', outputCol='pca_features')
    pca_model = pca.fit(features_df)
    pca_features_df = pca_model.transform(features_df).select('is_positive_sentiment', 'pca_features')
    pca_features_df.show(truncate=False)

    standard_scaler = StandardScaler(inputCol='pca_features', outputCol='scaled_pca_features')
    standard_scaler_model = standard_scaler.fit(pca_features_df)
    standard_scaler_features_df = standard_scaler_model.transform(pca_features_df).select('is_positive_sentiment', 'scaled_pca_features')
    standard_scaler_features_df.show(truncate=False)

    expanded_features_df = (standard_scaler_features_df.withColumn('f', to_array(col('scaled_pca_features')))
        .select(['is_positive_sentiment'] + [col('f')[i] for i in range(num_features)]))
    expanded_features_df.show()

    # Removed 'overwrite' mode to test for this issue:
    #    https://stackoverflow.com/questions/51050591/spark-throws-java-io-ioexception-failed-to-rename-when-saving-part-xxxxx-gz
    expanded_features_df.write.csv(path=s3_output_data,
                       header=None,
                       quote=None) #,
#                       mode='overwrite')

    print('Wrote to output file:  {}'.format(s3_output_data))
Example #28
    when(col("product_description").isNull(),
         "empty").otherwise(col("product_description")))
# combine relevant fields (product_title, attr_list, product_description, search_term) as one named `info`
train = train.select(
    "product_uid1", "search_term",
    concat(col("product_title"), lit(' '), col("attr_list"), lit(' '),
           col("product_description"), lit(' '),
           col("search_term")).alias("info")).orderBy("product_uid1")
train.show()
# train.printSchema()
# train.write.option("header", 'true').csv(path + 'saved_file.csv')

from pyspark.ml.feature import StopWordsRemover, Tokenizer, Word2Vec

# tokenize `info` and remove stopwords in unioned data
tokenizer = Tokenizer(inputCol="info", outputCol="tokenized_info")
tokenized = tokenizer.transform(train).drop("info")
tokenized.show()
# remover = StopWordsRemover(inputCol="tokenized_info", outputCol="filtered_info")
# removed = remover.transform(tokenized).drop("tokenized_info").na.drop()
# print removed.count()

# use union tokenized
word2Vec = Word2Vec(vectorSize=3,
                    minCount=5,
                    inputCol="tokenized_info",
                    outputCol="vec")
model = word2Vec.fit(tokenized)
# model.getVectors().show()
# tokenized search terms in train.csv
term_tokenizer = Tokenizer(inputCol="search_term", outputCol="term_token")
Example #29
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("PipelineExample")\
        .getOrCreate()

    # $example on$
    # Prepare training documents from a list of (id, text, label) tuples.
    training = spark.createDataFrame([(0, "a b c d e spark", 1.0),
                                      (1, "b d", 0.0),
                                      (2, "spark f g h", 1.0),
                                      (3, "hadoop mapreduce", 0.0)],
                                     ["id", "text", "label"])

    # Configure an ML pipeline, which consists of three stages: tokenizer, hashingTF, and lr.
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(),
                          outputCol="features")
    lr = LogisticRegression(maxIter=10, regParam=0.01)
    pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])

    # Fit the pipeline to training documents.
    model = pipeline.fit(training)

    # Prepare test documents, which are unlabeled (id, text) tuples.
    test = spark.createDataFrame([(4, "spark i j k"), (5, "l m n"),
                                  (6, "mapreduce spark"),
                                  (7, "apache hadoop")], ["id", "text"])

    # Make predictions on test documents and print columns of interest.
    prediction = model.transform(test)
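
    # The original example continues by printing the columns of interest; a reconstructed
    # sketch of that step (it may differ from the source file):
    selected = prediction.select("id", "text", "probability", "prediction")
    for row in selected.collect():
        print("({}, {}) --> prob={}, prediction={}".format(
            row.id, row.text, str(row.probability), row.prediction))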
Example #30
    'SentimentSource',
]
train = train.select(
    [column for column in train.columns if column not in drop_list])
train = train.withColumn("label", train["label"].cast("TINYINT"))

#removes null rows
train = train.na.drop()

#Load test data
test = spark.read.format("csv").option("header",
                                       "true").option("delimiter",
                                                      ";").load("cloud.csv")

#tokenizer
trainTokenizer = Tokenizer(inputCol="SentimentText;;;;;;;;;;;;;;;;;;;;;;;;",
                           outputCol="words")
trainCountTokens = udf(lambda words: len(words), IntegerType())
train = trainTokenizer.transform(train)

testTokenizer = Tokenizer(inputCol="text", outputCol="words")
testCountTokens = udf(lambda words: len(words), IntegerType())
test = testTokenizer.transform(test)

#Remove stopwords
trainRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
train = trainRemover.transform(train)

testRemover = StopWordsRemover(inputCol="words", outputCol="filtered")
test = testRemover.transform(test)

# fit CountVectorizerModel
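
# A sketch of the step the comment above announces (assuming CountVectorizer is imported
# from pyspark.ml.feature in the part of the script not shown):
cv = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)
cv_model = cv.fit(train)
train = cv_model.transform(train)
test = cv_model.transform(test)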