def textPredict(request):
    """6.文本聚类,热度预测"""
    label = request.POST['label']
    title = request.POST['title']

    conf = SparkConf().setAppName('textPredict').setMaster('spark://HP-Pavilion:7077')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    """处理数据集,生成特征向量"""
    dfTitles = sqlContext.read.parquet('data/roll_news_sina_com_cn.parquet')
    print(dfTitles.dtypes)
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(dfTitles)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)
    rescaledData.show()
    for features_label in rescaledData.select("features", "rawFeatures").take(3):
        print(features_label)
    """决策树模型培训"""
    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
    featureIndexer =\
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(rescaledData)
    (trainingData, testData) = rescaledData.randomSplit([0.7, 0.3])
    dt = DecisionTreeClassifier(labelCol="indexedLabel", featuresCol="indexedFeatures")
    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, dt])
    model = pipeline.fit(trainingData)
    """模型测试"""
    predictions = model.transform(testData)
    predictions.show()
    predictions.select("prediction", "indexedLabel", "features").show(5)
    """用户数据测试,单个新闻测试"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    print("==================================================")
    myprediction.show()
    resultList = convertDfToList(myprediction)

    """模型评估"""
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="precision")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g " % (1.0 - accuracy))

    treeModel = model.stages[2]
    print(treeModel)

    sc.stop()
    # The template name below is an assumption; the original call omits it.
    return render(request, 'text_predict.html', {'resultList': resultList})
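
# convertDfToList is called above but defined elsewhere in the original project.
# A minimal, hypothetical sketch of such a helper (an assumption, not the original
# implementation): collect the prediction DataFrame into plain Python dicts so it
# can be passed to a Django template context.
def convertDfToList(df):
    return [row.asDict() for row in df.select("title", "prediction").collect()]
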
def main():
    spark = SparkSession.builder.appName("DBPediaSpark").getOrCreate()

    args = getResolvedOptions(sys.argv, ['S3_INPUT_BUCKET',
                                         'S3_INPUT_KEY_PREFIX',
                                         'S3_OUTPUT_BUCKET',
                                         'S3_OUTPUT_KEY_PREFIX',
                                         'S3_MODEL_BUCKET',
                                         'S3_MODEL_KEY_PREFIX'])

    # This is needed to save RDDs, which is the only way to write nested DataFrames to CSV format
    spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class",
                                                      "org.apache.hadoop.mapred.FileOutputCommitter")
    
    # Defining the schema corresponding to the input data. The input data does not contain the headers
    schema = StructType([StructField("label", IntegerType(), True), 
                         StructField("title", StringType(), True), 
                         StructField("abstract", StringType(), True)])
    
    # Download the data from S3 into two separate Dataframes
    traindf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                   'train.csv')), header=False, schema=schema, encoding='UTF-8')
    validationdf = spark.read.csv(('s3://' + os.path.join(args['S3_INPUT_BUCKET'], args['S3_INPUT_KEY_PREFIX'],
                                                          'test.csv')), header=False, schema=schema, encoding='UTF-8')

    # Tokenize the abstract column which contains the input text
    tokenizer = Tokenizer(inputCol="abstract", outputCol="tokenized_abstract")

    # Save transformed training data to CSV in S3 by converting to RDD.
    transformed_traindf = tokenizer.transform(traindf)
    transformed_train_rdd = transformed_traindf.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_train_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'train'))

    # Similar data processing for validation dataset.
    transformed_validation = tokenizer.transform(validationdf)
    transformed_validation_rdd = transformed_validation.rdd.map(lambda x: (x.label, x.tokenized_abstract))
    lines = transformed_validation_rdd.map(csv_line)
    lines.coalesce(1).saveAsTextFile('s3://' + os.path.join(args['S3_OUTPUT_BUCKET'], args['S3_OUTPUT_KEY_PREFIX'], 'validation'))

    # Serialize the tokenizer via MLeap and upload to S3
    SimpleSparkSerializer().serializeToBundle(tokenizer, "jar:file:/tmp/model.zip", transformed_validation)

    # Unzip the bundle, since SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    import zipfile
    with zipfile.ZipFile("/tmp/model.zip") as zf:
        zf.extractall("/tmp/model")

    # Write back the content as a .tar.gz file
    import tarfile
    with tarfile.open("/tmp/model.tar.gz", "w:gz") as tar:
        tar.add("/tmp/model/bundle.json", arcname='bundle.json')
        tar.add("/tmp/model/root", arcname='root')

    s3 = boto3.resource('s3')
    file_name = os.path.join(args['S3_MODEL_KEY_PREFIX'], 'model.tar.gz')
    s3.Bucket(args['S3_MODEL_BUCKET']).upload_file('/tmp/model.tar.gz', file_name)
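
# csv_line is used in main() above but not defined in this snippet; it lives elsewhere
# in the original script. A plausible sketch (an assumption, not the original code) that
# turns a (label, tokenized_abstract) pair into a single CSV line:
def csv_line(data):
    label, tokens = data
    return str(label) + "," + " ".join(tokens)
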
def token(dataframe, in_col, out_col):
    
    tokenizer = Tokenizer(inputCol=in_col, outputCol=out_col)
    dataframe = tokenizer.transform(dataframe)
    
    dataframe.printSchema()
    
    return dataframe
Example #4
def run_tf_idf_spark_ml(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=numFeatures)
    featurizedData = hashingTF.transform(wordsData)

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)

    return idfModel.transform(featurizedData)
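
# Hypothetical usage sketch for the helper above (the sqlContext used here is an
# assumption; the original script creates it elsewhere): the input DataFrame only
# needs a string column named "body".
sample_df = sqlContext.createDataFrame(
    [(1, "spark makes tf idf easy"), (2, "hashing tf then idf")], ["id", "body"])
tfidf_df = run_tf_idf_spark_ml(sample_df, numFeatures=1 << 18)
tfidf_df.select("features").show(truncate=False)
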
def predictLabel(label,title,model):
    """预测新闻的标签"""
    sentenceData = sqlContext.createDataFrame([
        (label,title),
    ],['label',"title"])
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    return myprediction
def create_features(raw_data):
    #Create DataFrame
    data_df = sqlContext.createDataFrame(raw_data.map(lambda r : Row(appid=r[0], price=r[1], sentence=r[2])))
    #Transform sentence into words
    tokenizer = Tokenizer(inputCol='sentence', outputCol='words')
    words_df = tokenizer.transform(data_df)
    #Calculate term frequency
    hashingTF = HashingTF(inputCol='words', outputCol='rawFeatures', numFeatures=5)
    featurized_df = hashingTF.transform(words_df)
    #Calculate inverse document frequency
    idf = IDF(inputCol='rawFeatures', outputCol='features')
    idfModel = idf.fit(featurized_df)
    return idfModel.transform(featurized_df)
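
# Hypothetical usage sketch for create_features (sc and sqlContext are assumed to be
# created elsewhere in the original script): an RDD of (appid, price, sentence) tuples
# is turned into a DataFrame with TF-IDF features.
raw = sc.parallelize([(1, 0.99, "fun puzzle game"), (2, 4.99, "photo editing app")])
features_df = create_features(raw)
features_df.select("appid", "features").show(truncate=False)
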
def preprocessing_titles(path,name):
    query = preprocessData(path)
    tokenizer = Tokenizer(inputCol="title", outputCol="tokenized_title")
    wordsData = tokenizer.transform(query)
    #after Stopword removal
    remover = StopWordsRemover(inputCol="tokenized_title", outputCol="filtered")
    wordsData= remover.transform(wordsData)
    
    df = wordsData.map(lambda x:x['id']).zipWithUniqueId().toDF(["id","index"])
    df.registerTempTable("indices")
    wordsData.registerTempTable("words")
    
    qr = sqlContext.sql("SELECT index,words.id,filtered FROM indices JOIN words ON words.id = indices.id")
    if name!='':
        exportOnS3(qr,"s3a://redit-preprocessed/",name)
    qr = qr.map(lambda row: (row['index'], row['id'], row['filtered']))
Example #8
def get_top_words(dataset, signatures):
    # TODO: Use stemmers for the languages supported by http://www.nltk.org/api/nltk.stem.html#nltk.stem.snowball.SnowballStemmer
    # Or translate comments in other languages using the free Microsoft Translate API.
    sentenceData = dataset.filter(dataset['user_comments'].isNotNull() & (dataset['useragent_locale'].isNull() | (functions.instr(dataset['useragent_locale'], 'en') == 1)))

    if sentenceData.rdd.isEmpty():
        return dict()

    # Tokenize comments.
    tokenizer = Tokenizer(inputCol='user_comments', outputCol='words')
    wordsData = tokenizer.transform(sentenceData)

    # Remove duplicate words from comments.
    wordsData = wordsData.rdd.map(lambda p: (p['signature'], list(set(p['words'])))).reduceByKey(lambda x, y: x + y).toDF(['signature', 'words'])

    if wordsData.rdd.isEmpty():
        print("[WARNING]: wordsData is empty, sentenceData wasn't.")
        return dict()

    # Clean comment words by removing punctuation and stemming.
    def clean_word(w):
        return re.sub(r'[,.;:?!\[\]{}/\\]', '', stem(w.lower()))

    wordsData = wordsData.rdd.map(lambda p: (p['signature'], [clean_word(w) for w in p['words']])).toDF(['signature', 'words'])

    # XXX: Useless with TF-IDF?
    remover = StopWordsRemover(inputCol='words', outputCol='filtered')
    cleanWordsData = remover.transform(wordsData)

    cv = CountVectorizer(inputCol='filtered', outputCol='features')
    model = cv.fit(cleanWordsData)
    featurizedData = model.transform(cleanWordsData)

    idf = IDF(inputCol='features', outputCol='tfidf_features')
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    bests_per_doc = rescaledData.filter(rescaledData.signature.isin(signatures)).rdd.map(lambda p: (p['signature'], sorted(zip(p['tfidf_features'].indices, p['tfidf_features'].values), key=lambda i: i[1], reverse=True)[:10])).collect()

    return dict([(signature, [model.vocabulary[best] for best, val in bests]) for signature, bests in bests_per_doc])
Example #9
def run_tf_idf_spark_mllib(df, numFeatures=1 << 20):
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsData = tokenizer.transform(df)

    words = wordsData.select("words").rdd.map(lambda x: x.words)

    hashingTF = MllibHashingTF(numFeatures)
    tf = hashingTF.transform(words)
    tf.cache()

    idf = MllibIDF().fit(tf)
    tfidf = idf.transform(tf)

    # @TODO make this nicer
    tmp = sqlContext.createDataFrame(wordsData.rdd.zip(tfidf), ["data", "features"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, wordsData.columns))
    with_features = sqlContext.sql("SELECT %s, features FROM tmp" % old_columns)
    tmp = sqlContext.createDataFrame(with_features.rdd.zip(tf), ["data", "rawFeatures"])
    tmp.registerTempTable("tmp")
    old_columns = ', '.join(map(lambda x: 'data.%s' % x, with_features.columns))
    return sqlContext.sql("SELECT %s, rawFeatures FROM tmp" % old_columns)
Example #10
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession.builder.appName("TokenizerExample").getOrCreate()

    # $example on$
    sentenceDataFrame = spark.createDataFrame(
        [(0, "Hi I heard about Spark"),
         (1, "I wish Java 12 2 could use case classes"),
         (2, "Logistic,regression,models,are,neat")], ["id", "sentence"])

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    regexTokenizer = RegexTokenizer(inputCol="sentence",
                                    outputCol="words",
                                    pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)

    countTokens = udf(lambda words: len(words), IntegerType())

    tokenized = tokenizer.transform(sentenceDataFrame)
    tokenized.select("sentence", "words")\
        .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    regexTokenized.select("sentence", "words") \
        .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
    print("Apa yang disini")
    # $example off$

    spark.stop()
Example #11
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("TokenizerExample")\
        .getOrCreate()

    # $example on$
    sentenceDataFrame = spark.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["label", "sentence"])

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)

    tokenized = tokenizer.transform(sentenceDataFrame)
    for words_label in tokenized.select("words", "label").take(3):
        print(words_label)

    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    for words_label in regexTokenized.select("words", "label").take(3):
        print(words_label)
    # $example off$

    spark.stop()
Example #12
	(20,"apple iphone 6 tmobile 16gb"),
	(20,"Apple iPhone 6 (T Mobile) 16GB"),
	(20,"apple iphone 6 16gb t mobile"),
	(20,"Apple iPhone Apple iPhone 6 16GB 412 2 cell 2895"),
	(20,"iPhone 6 T Mobile 16 GB"),
	(20,"Apple 6 16gb T Mobile")
], ["label","text"])

# Learn a mapping from words to Vectors.
#word2Vec = Word2Vec(vectorSize=3, minCount=0, inputCol="text", outputCol="textVec")
#model = word2Vec.fit(documentDF)
#result = model.transform(documentDF)
#print result.take(2)

tokenizer = Tokenizer(inputCol="text", outputCol="tokenizedText")
tokenizedTextData = tokenizer.transform(documentDF)

hashingTF = HashingTF(inputCol="tokenizedText", outputCol="rawFeatures")
featurizedData = hashingTF.transform(tokenizedTextData)

idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
result1 = idfModel.transform(featurizedData)


for features_label in result.select("label","pcaFeatures").take(10):
  print(features_label)


wordsvectors = result["label","features"].map(lambda row: LabeledPoint(row[0], row[1]))

# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder, StringIndexer
lblIndxr = StringIndexer().setInputCol("color").setOutputCol("colorInd")
colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))
ohe = OneHotEncoder().setInputCol("colorInd")
ohe.transform(colorLab).show()


# COMMAND ----------

from pyspark.ml.feature import Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(20, False)


# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")\
  .setToLowercase(True)
rt.transform(sales.select("Description")).show(20, False)


# COMMAND ----------
tokenizer = Tokenizer(inputCol='text', outputCol='token_text')
stop_remove = StopWordsRemover(inputCol='token_text', outputCol='stop_token')
count_vec = CountVectorizer(inputCol='stop_token', outputCol='c_vec')
idf = IDF(inputCol='c_vec', outputCol='tf_idf')
# We convert 'spam' and 'ham' into numeric features (zeros and ones)
classes_to_numeric = StringIndexer(inputCol='class', outputCol='label')

from pyspark.ml.feature import VectorAssembler
data_features = VectorAssembler(inputCols=['tf_idf', 'length'],
                                outputCol='features')

data = data.replace(['spam', 'ham'], ['1', '0'])
data = data.withColumn('class_num', data['class'].cast('float'))

data_1 = tokenizer.transform(data)
data_1 = stop_remove.transform(data_1)
data_1 = count_vec.fit(data_1).transform(data_1)
data_1 = idf.fit(data_1).transform(data_1)
data_1 = data_features.transform(data_1)

data_final = data_1.select('class', 'class_num', 'features')
print('Final data set up'.upper())
data_final.show()

train_data, test_data = data_final.randomSplit([0.7, 0.3])
# Any ML classification model could be used here
from pyspark.ml.classification import LogisticRegression
log_reg = LogisticRegression(labelCol='class_num', featuresCol='features')
spam_detector = log_reg.fit(train_data)
test_results = spam_detector.transform(test_data)
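
# A possible follow-up (a sketch, not in the original snippet): score the held-out
# split with a binary classification evaluator; the AUC metric choice is an assumption.
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator(labelCol='class_num',
                                          rawPredictionCol='rawPrediction')
print('Test AUC:', evaluator.evaluate(test_results))
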
# -*- coding: utf-8 -*-
from pyspark import SparkConf, SparkContext
from pyspark.sql import SQLContext, Row, DataFrame
from pyspark.ml.feature import HashingTF, IDF, Tokenizer

conf = SparkConf().setAppName("tfidf").setMaster("spark://HP-Pavilion:7077")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)

dfTitles = sqlContext.read.parquet("roll_news_sina_com_cn.parquet")
print(dfTitles.dtypes)
tokenizer = Tokenizer(inputCol="title", outputCol="words")
wordsData = tokenizer.transform(dfTitles)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.show()
for features_label in rescaledData.select("features", "rawFeatures").take(3):
    print(features_label)

sc.stop()
Example #16
info = lines.filter(lambda line: ' : ' in line or ' ----- ' in line).map(
    log_to_row)

df = spark.createDataFrame(
    info,
    "date_str string, thread_num int, operation int, level string, context string"
)


#######################################################################
def words_padding(x):
    return x.strftime("%b %d %H")


tokenizer = Tokenizer(inputCol="context", outputCol="context_words")
df = tokenizer.transform(df)

# indexer = StringIndexer(inputCol="context_words", outputCol="context_words_label")
# indexed = indexer.fit(df).transform(df)


#####################################################################
# Encode the log content; currently unused, since FP-Growth turned out to work without encoding
def context_process(x):
    return x.split(' ')[0]


# Mapping over a DataFrame requires withColumn or select + a UDF; select requires renaming the column
context_process = udf(context_process, StringType())
df = df.withColumn('one_context', context_process(df['context']))
Example #17
from pyspark.sql import SQLContext
from pyspark.sql.functions import desc, explode
from pyspark.sql.types import *
from storage import Sqlite

PARTITIONS = 500
THRESHOLD = 50

if __name__ == "__main__":
    conf = SparkConf().setAppName("reddit")
    conf.set('spark.serializer', 'org.apache.spark.serializer.KryoSerializer')
    conf.set('spark.local.dir', '/mnt/work')
    conf.set('spark.driver.maxResultSize', '12g')
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    fields = [StructField("subreddit", StringType(), True),
          StructField("body", StringType(), True)]
    rawDF = sqlContext.read.json("file:///mnt/s3/2015/*", StructType(fields))
    # split comments into words
    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(rawDF)

    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    filteredDataFrame = remover.transform(wordsDataFrame)
    # explode terms into individual rows
    termDataFrame = filteredDataFrame.select(['subreddit', explode(filteredDataFrame.filtered).alias("term")])
    # group by subreddit and term, then count occurence of term in subreddits
    countsDataFrame = termDataFrame.groupBy(['subreddit', 'term']).count()

    db =  Sqlite()
    countsDataFrame.select(['subreddit', 'term', 'count']).filter('count > {}'.format(THRESHOLD)).foreachPartition(db.saveSubredditWords)
Example #18
from __future__ import print_function
from pyspark.ml.feature import HashingTF, IDF, Tokenizer, NGram
from pyspark.sql import SparkSession

# Create the Spark session
spark = SparkSession.builder.appName("Ngrams").getOrCreate()

# Create the dataframe with five text abstracts
abstracts = spark.read.text('abs*.txt')

# Tokenize the abstract texts
tokenizer = Tokenizer(inputCol="value", outputCol="words")
wordsData = tokenizer.transform(abstracts)

# Creating n-grams with n=5
ngram = NGram(n=5, inputCol="words", outputCol="ngrams")
ngramDataFrame = ngram.transform(wordsData)

# Apply term frequency hashing on the abstracts
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=40)
featurizedData = hashingTF.transform(ngramDataFrame)

# Calculate the inverse document frequency
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

# Display the results
rescaledData.select("features").show(20, truncate=False)
Example #19
     .builder \
     .appName("ml_classification") \
     .getOrCreate()
     
 
 
###########################################################################
#########        Tokenizing Training and Test Set                #########


# test set
test_text = sc.textFile("data/test_clean" + str(part) + ".csv")
test_df = test_text.map(lambda x: (0, x)).toDF(["nothing", "sentence"])  # (0, x) is a workaround

tokenizer_test = Tokenizer(inputCol="sentence", outputCol="words")
wordsData_test = tokenizer_test.transform(test_df)

df_test = wordsData_test
nb_features_test = df_test.rdd.map(lambda x: len(x["words"])).sum()

# training set
text_positive = sc.textFile("data/training_positif_clean.csv")
text_negative = sc.textFile("data/training_negatif_clean.csv")

pos_labels = text_positive.map(lambda x: 1.0).zip(text_positive.map(lambda x: x))
neg_labels = text_negative.map(lambda x: 0.0).zip(text_negative.map(lambda x: x))

pos_df = pos_labels.toDF(["label", "sentence"])
neg_df = neg_labels.toDF(["label", "sentence"])

text_df = neg_df.union(pos_df)
def get_trending_news(rdd):
    if not rdd.isEmpty():
        spark = getSparkSessionInstance(rdd.context.getConf())

        df = spark.createDataFrame(rdd)

        # Append the title and summary together
        df_news_concat = df.withColumn("news_content",
                                       fn.concat_ws(" ", df.title, df.summary))

        df_punc_removed = df_news_concat.withColumn(
            "news_content_removed",
            fn.regexp_replace(df_news_concat.news_content, r"\p{Punct}", ""))

        udf_remove_unicode = fn.udf(
            lambda x: x.encode("ascii", "ignore").decode("ascii"))
        df_news_content_ascii = df_punc_removed.withColumn(
            "news_content_ascii",
            udf_remove_unicode(df_punc_removed.news_content_removed))

        # insert raw data to the cassandra table
        df_news_content_ascii.select("id", "news_provider", "published", "summary", "title") \
            .write \
            .format("org.apache.spark.sql.cassandra") \
            .mode("append") \
            .options(table="travel_news_data", keyspace="news_stream_analysis") \
            .save(mode="append")

        tokenizer = Tokenizer(inputCol="news_content_ascii",
                              outputCol="content_words")
        df_tokenized_content = tokenizer.transform(df_news_content_ascii).drop(
            "news_content")

        remover = StopWordsRemover(inputCol="content_words",
                                   outputCol="filtered_words")
        stop_words = remover.loadDefaultStopWords("english")
        stop_words.extend([
            '', "travel", "trip", "submitted", "abc", "reditt", "by", "time",
            "timing", "comments", "comment", "thank", "link", "im", "thanks",
            "would", "like", "get", "good", "go", "may", "also", "going",
            "dont", "want", "see", "take", "looking", ""
        ])
        remover.setStopWords(stop_words)
        df_stop_words_removed = remover.transform(df_tokenized_content).drop(
            "content_words")

        cv = CountVectorizer(inputCol="filtered_words",
                             outputCol="rawFeatures")
        cv_model = cv.fit(df_stop_words_removed)
        df_tf_data = cv_model.transform(df_stop_words_removed)
        df_features = df_tf_data.select(
            df_tf_data.rawFeatures.alias("features"))

        def convert_term_indices_to_term(term_indices, vocab):
            terms = []
            for t in term_indices:
                terms.append(vocab[t])

            return str(terms)

        # LDA
        lda = LDA(k=5, maxIter=50, learningOffset=8192.0, learningDecay=0.50)
        model = lda.fit(df_features)
        df_topics = model.describeTopics()

        fn_term_indices_to_term = fn.udf(convert_term_indices_to_term)
        vocab_lit = fn.array(*[fn.lit(k) for k in cv_model.vocabulary])
        df_lda_result = df_topics.withColumn(
            "terms", fn_term_indices_to_term("termIndices", vocab_lit))
        df_lda_result.select("topic", "termIndices",
                             "terms").show(truncate=False)

        df_lda_result.cache()

        lda_terms = df_lda_result.select("terms").collect()
        lda_terms_list = [str(i.terms) for i in lda_terms]

        # based on model terms choose news stories
        for term_list in lda_terms_list:
            s = []
            topic_words = term_list[1:-1].split(",")
            for term in topic_words:
                term = term.split("'")[1]
                s.append(r"(^|\W)" + str(term) + r"($|\W)")
            rx = '|'.join('(?:{0})'.format(x.strip()) for x in s)
            df_results = df_news_content_ascii.filter(
                df_news_content_ascii['news_content_ascii'].rlike(rx))
            df_results = df_results.withColumn("topic_words",
                                               fn.lit(str(topic_words)[1:-1]))
            df_results = df_results.withColumn("results_date",
                                               fn.lit(datetime.datetime.now()))

            # insert results with the raw data to the cassandra table
            df_results.select("id", "news_provider", "published", "summary", "title", "topic_words", "results_date") \
                .write \
                .format("org.apache.spark.sql.cassandra") \
                .mode("append") \
                .options(table="travel_news_data_results", keyspace="news_stream_analysis") \
                .save(mode="append")
Example #21
cleanTweets = tweets.withColumn('hashtags',extractEntitiesUDF('text',F.lit(hashtagRegex)))\
  .withColumn('mentions',extractEntitiesUDF('text',F.lit(mentionRegex)))\
  .withColumn('cleanText',cleanTextUDF('text'))

cleanTweets.select('text', 'cleanText', 'hashtags', 'mentions').show(5)

# ## Step 6. Tokenize tweet text
# We now want to take the cleansed tweet text and transform it into an array of tokens. To
# do this, we need to:
# * Tokenize each tweet text
# * Remove any stop words in the text
# * Stem any remaining words in the text

# We will use Spark NLP functions to tokenize and remove stopwords, and NLTK to stem the words
tokenizer = Tokenizer(inputCol="cleanText", outputCol="words")
tokenizedTweets = tokenizer.transform(cleanTweets)

swRemover = StopWordsRemover(inputCol="words", outputCol="nonStopWords")
tokenizedTweetsNoSW = swRemover.transform(tokenizedTweets)


def stem(words):
    stemmer = PorterStemmer()
    stemmed = [stemmer.stem(word) for word in words]
    return stemmed


stemUDF = F.udf(stem, ArrayType(StringType()))
stemmedTweets = tokenizedTweetsNoSW.withColumn('stemmedWords',
                                               stemUDF('nonStopWords'))
Example #22
reviews.show()


%pyspark
reviews.createOrReplaceTempView("reviews2")
reviewdf = sqlContext.sql("SELECT CASE WHEN helpful/total_votes> 0.6 THEN 1 ELSE 0 END AS Helpful_Score, reviewText, reviewLength, summaryLength,category,reviewCount, overall FROM reviews2")
reviewdf.show(5)


%pyspark
import pyspark.sql.functions as func
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StopWordsRemover

tokenizer = Tokenizer(inputCol="reviewText", outputCol="words")
wordsData = tokenizer.transform(reviewdf)

# Stop words exclusion
remover = StopWordsRemover(inputCol="words", outputCol="filtered")
wordsData = remover.transform(wordsData)
wordsData = wordsData.select('filtered', 'summaryLength', 'reviewLength','Helpful_Score','overall','reviewCount')
wordsData.show(5)


%pyspark
#Hash TF on tokenized data
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=50)
featurizedData = hashingTF.transform(wordsData)

#TF-IDF Vectorizer
idf = IDF(inputCol="rawFeatures", outputCol="features")
Example #23
def preprocess_files(bucket_name, file_name):

    raw_data = sql_context.read.json("s3a://{0}/{1}".format(
        bucket_name, file_name))

    # Clean article text
    print(colored("[PROCESSING]: Cleaning article text", "green"))
    clean_body = F.udf(lambda body: filter_body(body), StringType())
    clean_article_data = raw_data.withColumn("cleaned_body",
                                             clean_body("text"))
    # Tokenize article text
    print(colored("[PROCESSING]: Tokenizing text vector...", "green"))
    tokenizer = Tokenizer(inputCol="cleaned_body",
                          outputCol="text_body_tokenized")
    tokenized_data = tokenizer.transform(clean_article_data)

    # Remove stop words
    print(colored("[PROCESSING]: Removing stop words", "green"))
    stop_words_remover = StopWordsRemover(
        inputCol="text_body_tokenized",
        outputCol="text_body_stop_words_removed")
    stop_words_removed_data = stop_words_remover.transform(tokenized_data)

    # Stem words
    print(colored("Stemming tokenized text", "green"))
    stem = F.udf(lambda tokens: lemmatize(tokens), ArrayType(StringType()))
    stemmed_data = stop_words_removed_data.withColumn(
        "text_body_stemmed", stem("text_body_stop_words_removed"))

    # Shingle resulting body
    print(colored("Shingling resulting text", "green"))
    shingle = F.udf(lambda tokens: get_n_gram_shingles(tokens, 3),
                    StringType())
    shingled_data = stemmed_data.withColumn("text_body_shingled",
                                            shingle("text_body_stemmed"))
    shingle_table = shingled_data.select('id', 'text_body_shingled')
    print(colored("Adding category/id mappings to Redis", "green"))

    # Create a mapping of article categories to article id's that fall under that category. Each key is an article category and the values the list of article id's.
    cat_id_map = raw_data.select(
        F.explode('categories').alias('category'),
        'id').groupBy(F.col('category')).agg(
            F.collect_list('id').alias('ids_list')).where(
                F.size(F.col('ids_list')) < 200).withColumn(
                    'ids', to_str_udf('ids_list'))
    print(colored("Beginning writing category/id mapping to Redis", "green"))

    def write_cat_id_map_to_redis(rdd):
        rdb = redis.StrictRedis(config.REDIS_SERVER, port=6379, db=0)
        for row in rdd:
            rdb.sadd('cat:{}'.format(row.category), row.ids)

    cat_id_map.foreachPartition(write_cat_id_map_to_redis)
    print(cat_id_map.show(5, True))
    print(colored("Finished writing category/id mapping to Redis", "green"))

    #Minhash calculations
    k = 100
    random_seed = 50
    masks = (np.random.RandomState(seed=random_seed).randint(
        np.iinfo(np.int64).min,
        np.iinfo(np.int64).max, k))

    def update_min_hash_signature(word, min_hash_signature):
        root_hash = mmh3.hash64(pickle.dumps(word))[0]
        word_hashes = np.bitwise_xor(
            masks, root_hash
        )  # XOR root hash with k randomly generated integers to simulate k hash functions
        min_hash_signature = np.minimum(min_hash_signature, word_hashes)
        return min_hash_signature

    def calc_min_hash_signature(tokens):
        min_hash_signature = np.empty(k, dtype=np.int64)
        min_hash_signature.fill(np.iinfo(np.int64).max)
        for token in tokens:
            min_hash_signature = update_min_hash_signature(
                token, min_hash_signature)
        return min_hash_signature

    def compute_minhash(df):
        calc_min_hash_udf = F.udf(
            lambda x: str(
                list(map(lambda x: int(x), calc_min_hash_signature(x)))),
            StringType())
        df = df.withColumn("min_hash",
                           calc_min_hash_udf("text_body_shingled")).select(
                               'id', 'min_hash')
        return df

    print(colored("Computing minhash values", "green"))
    minhash_df = compute_minhash(shingle_table)
    print(colored("Finished computing minhash values", "green"))
    print(colored("Beginning writing minhash data to Redis", "green"))

    # Write minhash data to redis. If pipeline=True, use pipeline
    # method of inserting data in Redis
    def write_minhash_data_to_redis(rdd):
        rdb = redis.StrictRedis(config.REDIS_SERVER, port=6379, db=0)
        for row in rdd:
            rdb.sadd('id:{}'.format(row.id), row.min_hash)

    #print(minhash_df.show(5, True))
    minhash_df.foreachPartition(write_minhash_data_to_redis)

    print(colored("Finished writing minhash data to Redis", "green"))

    print(
        colored("[UPLOAD]: Writing preprocessed data to database...", "green"))
    #    write_aws_s3(config.S3_BUCKET, config.S3_FOLDER_PREPROCESSED, shingled_data)
    cf = configparser.ConfigParser()
    cf.read('../config/db_properties.ini')
Example #24
from pyspark.ml.feature import Tokenizer
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("tokenizer_sample") \
    .master("local[*]") \
    .getOrCreate()

data = [(0, "Tokenization is the process"), (1, "Refer to the Tokenizer")]
inputDF = spark.createDataFrame(data).toDF("id", "input")
tokenizer = Tokenizer(inputCol="input", outputCol="output")
outputDF = tokenizer.transform(inputDF)
outputDF.printSchema()
outputDF.show()

spark.stop()
Example #25
label_message = label_message.withColumn(
    "msg", regexp_replace("msg", "sad|happy", ""))
# union_message.show(100)

# Hasing message
# hashingTF = HashingTF(2000)
# hashingTF = HashingTF()
# hash_message = label_message.rdd.map(lambda row: (hashingTF.transform(row[0]), row[1]))
# hash_message = spark.createDataFrame(hash_message, ["hash_msg", "is_happy"])
# hash_message = hash_message.withColumnRenamed("_1", "hash_msg") \
#     .withColumnRenamed("_2", "is_happy")
# label_message = label_message.withColumn("hash", hashingTF.transform(label_message.msg))
# label_message.show(100)
# hash_message.show()
tokenizer = Tokenizer(inputCol="msg", outputCol="token_msg")
hash_message = tokenizer.transform(label_message)
hasingTF = HashingTF(inputCol="token_msg",
                     outputCol="hash_msg",
                     numFeatures=2000)
hash_message = hasingTF.transform(hash_message)
# hash_message = label_message

# Split messages into training and validation set
label_indexer = StringIndexer(inputCol="is_happy",
                              outputCol="indexed_label").fit(hash_message)
feature_indexer = VectorIndexer(inputCol="hash_msg",
                                outputCol="indexed_hash_msg").fit(hash_message)

validation_set, training_set = hash_message.randomSplit([0.3, 0.7])
validation_set.show()
training_set.show()
# Spark provides rich text analytics capabilities including nGram extraction, TF-IDF, 
# stop words removal, vectorization, and more that can be used to build machine learning
# models based on textual data. 
# ### Sample of Maintenance Logs
maintenance = spark.read.format("com.databricks.spark.csv").option("delimiter", "|")\
  .load("maintenance/maintenance_logs.txt")\
  .withColumnRenamed('_c0','date')\
  .withColumnRenamed('_c1','note')\
  .withColumnRenamed('_c2','duration')\
  .withColumn('note', F.lower(F.regexp_replace('note', '[.!?-]', '')))\
  .select(F.col('date').cast('date'), 'note', F.col('duration').cast('int'))
maintenance.show(5, truncate=False)

# ### Sample of 2-word nGrams on Maintenance Notes
tk = Tokenizer(inputCol="note", outputCol="words") # Tokenize
maintTokenized = tk.transform(maintenance)
swr = StopWordsRemover(inputCol="words", outputCol="filtered") # Remove stop-words
maintFiltered = swr.transform(maintTokenized)
ngram = NGram(n=2, inputCol="filtered", outputCol="ngrams") # 2-word nGrams
maintNGrams = ngram.transform(maintFiltered)
maintNGrams.select('ngrams').show(5, truncate=False)

# ### Topic Clustering using Latent Dirichlet Allocation (LDA)
# LDA is a form of unsupervised machine learning that identifies clusters, or topics,
# in the data
cv = CountVectorizer(inputCol="ngrams", outputCol="features", vocabSize=50)\
  .fit(maintNGrams) # CountVectorizer converts the nGram array into a vector of counts
maintVectors = cv.transform(maintNGrams)
vocabArray = cv.vocabulary
lda = LDA(k=3, maxIter=10)
ldaModel = lda.fit(maintVectors)
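
# A short follow-on sketch (not part of the original example): inspect the fitted topics
# by mapping term indices back to the CountVectorizer vocabulary collected above.
topics = ldaModel.describeTopics(maxTermsPerTopic=5)
for row in topics.collect():
    print(row['topic'], [vocabArray[i] for i in row['termIndices']])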
Example #27
from pyspark.ml.feature import IDF
from pyspark.sql import SparkSession

spark = SparkSession \
    .builder \
    .appName("tf_idf_sample") \
    .master("local[*]") \
    .getOrCreate()

df1 = spark.createDataFrame([(0, "a a a b b c"), (0, "a b c"),
                             (1, "a c a a d")]).toDF("label", "sentence")

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

# Split each sentence into words
df2 = tokenizer.transform(df1)

hashingTF = HashingTF(inputCol="words",
                      outputCol="TF-Features",
                      numFeatures=20)
df3 = hashingTF.transform(df2)

df3.cache()

idf = IDF(inputCol="TF-Features", outputCol="Final-Features")
idfModel = idf.fit(df3)

rescaledData = idfModel.transform(df3)
rescaledData.select("words", "TF-Features", "Final-Features").show()

spark.stop()
Example #28
def sentimental_analysis(time, rdd):

    rdd = rdd.map(lambda x: json.loads(x[1]))

    text_array = rdd.collect()

    text_array = [
        element["text"].lower() for element in text_array if "text" in element
    ]

    rdd = sc.parallelize(text_array)

    rdd = rdd.map(lambda x: x.replace(',', ' ')).map(
        lambda x: x.replace('/', ' ')).map(lambda x: x.replace('?', ' ')).map(
            lambda x: x.replace('...', ' ')).map(lambda x: x.replace('-', ' '))

    rdd = rdd.map(lambda x: x.replace('.', ' ')).map(
        lambda x: x.replace('(', ' ')).map(lambda x: x.replace(')', ' ')).map(
            lambda x: x.replace('!', ' ')).map(lambda x: x.replace('|', ' '))

    rdd = rdd.map(lambda sn: ' '.join(
        filter(lambda x: x.startswith(
            ('@', 'http', '"', '&', 'rt')) == False, sn.split())))

    tweets_MAGA = rdd.filter(lambda x: "maga" in x).map(lambda x: [x, "MAGA"])

    tweets_DICTATOR = rdd.filter(lambda x: "dictator" in x).map(
        lambda x: [x, "DICTATOR"])

    tweets_IMPEACH = rdd.filter(lambda x: "impeach" in x).map(
        lambda x: [x, "IMPEACH"])

    tweets_DRAIN = rdd.filter(lambda x: "drain" in x).map(
        lambda x: [x, "DRAIN"])

    tweets_SWAMP = rdd.filter(lambda x: "swamp" in x).map(
        lambda x: [x, "SWAMP"])

    tweets_COMEY = rdd.filter(lambda x: "comey" in x).map(
        lambda x: [x, "COMEY"])

    tweets = tweets_DICTATOR.union(tweets_IMPEACH).union(tweets_DRAIN).union(
        tweets_SWAMP).union(tweets_COMEY).union(tweets_MAGA)

    set_tweets = tweets.map(
        lambda x: Row(sentence=str.strip(x[0]), label=x[1], date_time=time))

    spark = getSparkSessionInstance(rdd.context.getConf())

    partsDF = spark.createDataFrame(set_tweets)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    tokenized = tokenizer.transform(partsDF)

    remover = StopWordsRemover(
        inputCol="words",
        outputCol="base_words")  # define the parameters of the StopWordsRemover function

    base_words = remover.transform(tokenized)

    train_data_row = base_words.select("base_words", "label", "date_time")

    word2vec = Word2Vec(vectorSize=3,
                        minCount=0,
                        inputCol="base_words",
                        outputCol="features")

    model = word2vec.fit(train_data_row)

    final_train_data = model.transform(train_data_row)

    resul_analysis = classifier.transform(final_train_data)

    resul_analysis = resul_analysis.select("label", "date_time", "prediction")

    resul_analysis.createOrReplaceTempView("sentimental_analysis")

    resul_analysisDF = spark.sql(
        "select label, date_time, prediction, count(*) as total_label from sentimental_analysis group by label, date_time, prediction order by total_label"
    )

    resul_analysisDF.write.mode("append").saveAsTable("sentimental_analysis")
Example #29
def training_data():

    rdd = sc.textFile("/user/sentimental_analysis/Subset100k.csv")

    #header = data.first()

    #rdd = data.filter(lambda row: row != header)

    r = rdd.mapPartitions(lambda x: csv.reader(x))

    #r2 = r.map(lambda x: (x[3], int(x[1])))

    part = r.map(lambda x: Row(sentence=str.strip(x[3]), label=int(x[1]))
                 )  # apply a schema and create a DataFrame

    spark = getSparkSessionInstance(rdd.context.getConf())

    partsDF = spark.createDataFrame(part)

    #partDF.show()

    tokenizer = Tokenizer(
        inputCol="sentence",
        outputCol="words")  # define the parameters of the Tokenizer function

    tokenized = tokenizer.transform(
        partsDF)  # the tokenizer splits each sentence into words, row by row

    #tokenized.show()

    remover = StopWordsRemover(
        inputCol="words",
        outputCol="base_words")  # define the parameters of the StopWordsRemover function

    base_words = remover.transform(tokenized)

    #base_words.show()

    train_data_row = base_words.select("base_words", "label")

    word2vec = Word2Vec(vectorSize=3,
                        minCount=0,
                        inputCol="base_words",
                        outputCol="features")

    model = word2vec.fit(train_data_row)

    final_train_data = model.transform(train_data_row)

    #final_train_data.show()

    final_train_data = final_train_data.select("label", "features")

    lr = LogisticRegression(maxIter=10000,
                            regParam=0.001,
                            elasticNetParam=0.0001)

    lrModel = lr.fit(final_train_data)

    #lrModel.transform(final_train_data).show()

    return lrModel
    #------------------------------------------------ Validation Data 2 ---------------------------------------#

    rdd = sc.textFile("/user/sentimental_analysis/Subset100k.csv")

    r = rdd.mapPartitions(lambda x: csv.reader(x))

    part = r.map(lambda x: Row(sentence=str.strip(x[3]), label=int(x[1])))

    spark = getSparkSessionInstance(rdd.context.getConf())

    partsDF = spark.createDataFrame(part)

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    tokenized = tokenizer.transform(partsDF)

    remover = StopWordsRemover(
        inputCol="words",
        outputCol="base_words")  # define the parameters of the StopWordsRemover function

    base_words = remover.transform(tokenized)

    train_data_row = base_words.select("base_words", "label")

    word2vec = Word2Vec(vectorSize=3,
                        minCount=0,
                        inputCol="base_words",
                        outputCol="features")

    model = word2vec.fit(train_data_row)

    final_train_data = model.transform(train_data_row)

    final_train_data = final_train_data.select("label", "features")

    #lrModel.transform(final_train_data).show()

    print(
        "********************************** Everything under control ***********************************"
    )
Example #30
    .appName("SimpleApplication") \
    .getOrCreate()

print("Загружаем данные...")
input_data = spark.sparkContext.wholeTextFiles(PATH)

print("Готовим данные...")
prepared_data = input_data.map(lambda x: (get_patent_name(x[1]), get_claims(x[1]))) \
    .map(lambda x: (x[0], remove_punctuation(x[1]))) \
    .map(lambda x: (x[0], remove_linebreaks(x[1])))

prepared_df = prepared_data.toDF().selectExpr('_1 as patent_name', '_2 as patent_claims')

print("Разбиваем текст на токены...")
tokenizer = Tokenizer(inputCol="patent_claims", outputCol="words")
words_data = tokenizer.transform(prepared_df)

print("Фильтруем токены...")
filtered_words_data = words_data.rdd.map(lambda x: (x[0], x[1], get_only_words(x[2])))
filtered_df = filtered_words_data.toDF().selectExpr('_1 as patent_name', '_2 as patent_claims', '_3 as words')

print("Удаляем стоп-слова...")
remover = StopWordsRemover(inputCol='words', outputCol='filtered')
filtered = remover.transform(filtered_df)

print("Считаем признаки...")
vectorizer = CountVectorizer(inputCol='filtered', outputCol='raw_features').fit(filtered)
featurized_data = vectorizer.transform(filtered)
featurized_data.cache()

print("Считаем относительные частоты признаков...")
    .appName("e8_3") \
    .config("spark.some.config.option", "some-value") \
    .getOrCreate()
sc = spark.sparkContext
sqlContext = SQLContext(sc)

spamFile = sc.textFile("Spam.csv")
fileLineTokens = spamFile.map(lambda line:
                              (line.split(",", 1)[0], line.split(",", 1)[1]))
fileDf = sqlContext.createDataFrame(fileLineTokens, ["categ", "text"])

training_data, testing_data = fileDf.randomSplit([0.8, 0.2])

categoryIndexerUnit = StringIndexer(inputCol="categ", outputCol="label")
tokenizerUnit = Tokenizer(inputCol="text", outputCol="words")
tokenizedWords = tokenizerUnit.transform(fileDf)
stoRemover = StopWordsRemover(inputCol="words", outputCol="clean_content")
tweetWordsData = stoRemover.transform(tokenizedWords)
hashingTermFrequecies = HashingTF(inputCol="clean_content",
                                  outputCol="rawFeatures",
                                  numFeatures=500)
featuredData = hashingTermFrequecies.transform(tweetWordsData)
idfData = IDF(inputCol="rawFeatures", outputCol="features")
naiveBaysClassifier = NaiveBayes(smoothing=1.0, modelType="multinomial")

# Here I didn't use the IDF model because there is only one main doc, and through hashing TF Naive Bayes can identify spam or not
pipeline = Pipeline(stages=[
    categoryIndexerUnit, tokenizerUnit, stoRemover, hashingTermFrequecies,
    idfData, naiveBaysClassifier
])
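
# A minimal sketch of how this pipeline might be fit and scored on the split created
# above (not in the original snippet; the evaluator import and metric are assumptions).
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
pipelineModel = pipeline.fit(training_data)
predictions = pipelineModel.transform(testing_data)
evaluator = MulticlassClassificationEvaluator(labelCol="label",
                                              predictionCol="prediction",
                                              metricName="accuracy")
print("Test accuracy = %g" % evaluator.evaluate(predictions))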
    
    #normalized = " ".join(lemma.lemmatize(word,'v') for word in words)
    text_out = " ".join(lemma.lemmatize(word,'v') for word in text_out)
 
    
    return text_out


udf_cleantext = udf(cleanup_text, StringType())

clean_text = data.withColumn("clean_comm", udf_cleantext(data.comment_text))
#clean_text.select("clean_comm").show(3)

tokenizer = Tokenizer(inputCol="clean_comm", outputCol="tokens")

tokenized = tokenizer.transform(clean_text)
e = tokenized.select("clean_comm", "tokens")
#tokenized.select("clean_comm", "tokens").show(1)

remover = StopWordsRemover(inputCol="tokens", outputCol="tokens_Stop")

t = remover.transform(e)
tt = t.select('tokens_Stop')
#tt.show(2)

tt = tt.select('tokens_Stop')
tt = tt.limit(650000)  # you can choose the number of comments you want to run LDA on
tt.count()
#tt.toPandas().to_csv('cleaning.csv')  # Uncomment if you want to save the cleaned data

spark_train = spark_train.filter(~isnull('reflection_period'))
spark_train = spark_train.filter(~isnull('cleaned_hm'))
spark_train = spark_train.filter(~isnull('num_sentence'))
spark_train = spark_train.filter(~isnull('predicted_category'))

#For mapping labels
prediction_scores = spark_train.groupBy("predicted_category").count().orderBy(col("count").desc())
pd_df_train =prediction_scores.toPandas()
pd_df_train['predict_score'] = np.arange(len(pd_df_train))
spark_df = spark.createDataFrame(pd_df_train)
spark_df = spark_df.drop('count')
spark_df = spark_df.selectExpr("predicted_category as predicted_category_table", "predict_score as predict_score")

#Tokenizing and Vectorizing
tok = Tokenizer(inputCol="cleaned_hm", outputCol="words")
review_tokenized = tok.transform(spark_train)

stopword_rm = StopWordsRemover(inputCol='words', outputCol='words_nsw')
review_tokenized = stopword_rm.transform(review_tokenized)

cv = CountVectorizer(inputCol='words_nsw', outputCol='tf')
cvModel = cv.fit(review_tokenized)
count_vectorized = cvModel.transform(review_tokenized)

idf_ngram = IDF().setInputCol('tf').setOutputCol('tfidf')
tfidfModel_ngram = idf_ngram.fit(count_vectorized)
tfidf_df = tfidfModel_ngram.transform(count_vectorized)

word_indexer_pc = StringIndexer(inputCol="predicted_category", outputCol="predicted_category_new", handleInvalid="error")

#Splitting the training data into training data and validation data
Example #34
                          outputCol="originalCategory")
converted = converter.transform(dataIndexed)

converted.show()

labelLookup = converted.dropDuplicates(['indexedLabel', 'originalCategory'
                                        ]).select('indexedLabel',
                                                  'originalCategory')

# Tokenize the text and remove stop words

tokenizer = Tokenizer(inputCol="value", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                           outputCol="filtered")

tokenized = tokenizer.transform(converted)
removed = remover.transform(tokenized)

tokenized.show()
removed.show()

# Extracting Features

hashingTF = HashingTF(inputCol=remover.getOutputCol(),
                      outputCol="features",
                      numFeatures=20)
hashed = hashingTF.transform(removed)

hashed.show()

# Making Labeled Point
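# The original example appears truncated here; a hypothetical sketch of the announced
# step, assuming the indexedLabel column from the earlier StringIndexer survived the
# transformations above:
from pyspark.mllib.linalg import Vectors as MLLibVectors
from pyspark.mllib.regression import LabeledPoint
labeledPoints = hashed.rdd.map(
    lambda row: LabeledPoint(row['indexedLabel'], MLLibVectors.fromML(row['features'])))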
Example #35
              {'field': 'manufacturer', 'type': 'String'},
              {'field': 'model', 'type': 'String'},
              {'field': 'family', 'type': 'String'},
              ]
    gazetteer = Gazetteer(fields)
    # read in listings from json file
    # specifying fields makes the parsing more efficient in Spark
    listing_fields = [StructField("title", StringType(), True),
                      StructField("manufacturer", StringType(), True),
                      StructField("currency", StringType(), True),
                      StructField("price", StringType(), True),
                    ]
    listings = sqlContext.read.json(LISTINGS_PATH, StructType(listing_fields)).distinct()
    # break listing title into words
    tokenizer = Tokenizer(inputCol="title", outputCol="words")
    listings = tokenizer.transform(listings)
    # read in products from json file
    product_fields = [StructField("product_name", StringType(), True),
                      StructField("manufacturer", StringType(), True),
                      StructField("family", StringType(), True),
                      StructField("model", StringType(), True),
                    ]

    products = sqlContext.read.json(PRODUCTS_PATH, StructType(product_fields))\
                                .fillna({'family': ''}) # replace nulls in family fields

    products_df, products_dict = canonical_format(products, Product)
    listings_df, listings_dict = canonical_format(listings, Listing)

    products_training_dict = json.load(open(PRODUCTS_TRAINING_PATH))
    listings_training_dict = json.load(open(LISTINGS_TRAINING_PATH))
def transform(spark, s3_input_data, s3_output_train_data,
              s3_output_validation_data, s3_output_test_data):
    print('Processing {} => {}, {}, {}'.format(s3_input_data, s3_output_train_data,
                                               s3_output_validation_data,
                                               s3_output_test_data))

    schema = StructType([
        #        StructField('is_positive_sentiment', IntegerType(), True),
        StructField('marketplace', StringType(), True),
        StructField('customer_id', StringType(), True),
        StructField('review_id', StringType(), True),
        StructField('product_id', StringType(), True),
        StructField('product_parent', StringType(), True),
        StructField('product_title', StringType(), True),
        StructField('product_category', StringType(), True),
        StructField('star_rating', IntegerType(), True),
        StructField('helpful_votes', IntegerType(), True),
        StructField('total_votes', IntegerType(), True),
        StructField('vine', StringType(), True),
        StructField('verified_purchase', StringType(), True),
        StructField('review_headline', StringType(), True),
        StructField('review_body', StringType(), True),
        StructField('review_date', StringType(), True)
    ])

    df_csv = spark.read.csv(path=s3_input_data,
                            sep='\t',
                            schema=schema,
                            header=True,
                            quote=None)
    df_csv.show()

    # This dataset should already be clean, but always good to double-check
    print('Showing null review_body rows...')
    df_csv.where(col('review_body').isNull()).show()

    df_csv_cleaned = df_csv.na.drop(subset=['review_body'])
    df_csv_cleaned.where(col('review_body').isNull()).show()

    tokenizer = Tokenizer(inputCol='review_body', outputCol='words')
    wordsData = tokenizer.transform(df_csv_cleaned)

    hashingTF = HashingTF(inputCol='words',
                          outputCol='raw_features',
                          numFeatures=1000)
    featurizedData = hashingTF.transform(wordsData)

    # While applying HashingTF only needs a single pass to the data, applying IDF needs two passes:
    # 1) compute the IDF vector
    # 2) scale the term frequencies by IDF
    # Therefore, we cache the result of the HashingTF transformation above to speed up the 2nd pass
    featurizedData.cache()

    # spark.mllib's IDF implementation provides an option for ignoring terms
    # which occur in less than a minimum number of documents.
    # In such cases, the IDF for these terms is set to 0.
    # This feature can be used by passing the minDocFreq value to the IDF constructor.
    idf = IDF(inputCol='raw_features', outputCol='features')  #, minDocFreq=2)
    idfModel = idf.fit(featurizedData)
    features_df = idfModel.transform(featurizedData)
    features_df.select('star_rating', 'features').show()

    # TODO:  Use SVD instead
    # features_vector_rdd = features_df.select('features').rdd.map( lambda row: Vectors.fromML(row.getAs[MLVector]('features') )
    # features_vector_rdd.cache()
    # mat = RowMatrix(features_vector_rdd)
    # k = 300
    # svd = mat.computeSVD(k, computeU=True)
    # TODO:  Reconstruct

    num_features = 300
    pca = PCA(k=num_features, inputCol='features', outputCol='pca_features')
    pca_model = pca.fit(features_df)
    pca_features_df = pca_model.transform(features_df).select(
        'star_rating', 'pca_features')
    pca_features_df.show(truncate=False)

    standard_scaler = StandardScaler(inputCol='pca_features',
                                     outputCol='scaled_pca_features')
    standard_scaler_model = standard_scaler.fit(pca_features_df)
    standard_scaler_features_df = standard_scaler_model.transform(
        pca_features_df).select('star_rating', 'scaled_pca_features')
    standard_scaler_features_df.show(truncate=False)

    expanded_features_df = (standard_scaler_features_df.withColumn(
        'f', to_array(col('scaled_pca_features'))).select(
            ['star_rating'] + [col('f')[i] for i in range(num_features)]))
    expanded_features_df.show()

    train_df, validation_df, test_df = expanded_features_df.randomSplit(
        [0.9, 0.05, 0.05])

    # Removed overwrite to test for this issue
    #    https://stackoverflow.com/questions/51050591/spark-throws-java-io-ioexception-failed-to-rename-when-saving-part-xxxxx-gz
    train_df.write.csv(path=s3_output_train_data, header=None, quote=None)  #,
    #                       mode='overwrite')
    print('Wrote to output file:  {}'.format(s3_output_train_data))

    # Removed overwrite to test for this issue
    #    https://stackoverflow.com/questions/51050591/spark-throws-java-io-ioexception-failed-to-rename-when-saving-part-xxxxx-gz
    validation_df.write.csv(path=s3_output_validation_data,
                            header=None,
                            quote=None)  #,
    #                            mode='overwrite')
    print('Wrote to output file:  {}'.format(s3_output_validation_data))

    # Removed overwrite to test for this issue
    #    https://stackoverflow.com/questions/51050591/spark-throws-java-io-ioexception-failed-to-rename-when-saving-part-xxxxx-gz
    test_df.write.csv(path=s3_output_test_data, header=None, quote=None)  #,
    #                       mode='overwrite')
    print('Wrote to output file:  {}'.format(s3_output_test_data))
Example #37
def makeWord2VecModel():
    cursor = News.find({})
    text = ""
    for news in cursor:
        text += news['text']
    with open(os.path.join(os.getcwd(), 'word2Vec.txt'), 'w',
              encoding='utf-8') as outputFile:
        outputFile.write(text)
    spark = SparkSession.builder.appName("SimpleApplication").getOrCreate()

    # Load the file into an RDD line by line
    input_file = spark.sparkContext.textFile('word2Vec.txt')

    print(input_file.collect())
    prepared = input_file.map(lambda x: ([x]))
    df = prepared.toDF()
    prepared_df = df.selectExpr('_1 as text')

    # Split into tokens
    tokenizer = Tokenizer(inputCol='text', outputCol='words')
    words = tokenizer.transform(prepared_df)

    # Remove stop words
    stop_words = StopWordsRemover.loadDefaultStopWords('russian')
    remover = StopWordsRemover(inputCol='words',
                               outputCol='filtered',
                               stopWords=stop_words)
    filtered = remover.transform(words)

    # Print the Russian stop words
    print(stop_words)

    # Show the filtered table
    filtered.show()

    # Show the words column with the tokens before stop-word removal
    words.select('words').show(truncate=False, vertical=True)

    # Show the "filtered" column with the tokens after stop-word removal
    filtered.select('filtered').show(truncate=False, vertical=True)

    # Compute the TF values
    vectorizer = CountVectorizer(inputCol='filtered',
                                 outputCol='raw_features').fit(filtered)
    featurized_data = vectorizer.transform(filtered)
    featurized_data.cache()
    vocabulary = vectorizer.vocabulary

    # Show the table with the term frequency values
    featurized_data.show()

    # Show the "raw_features" column of featurized_data
    featurized_data.select('raw_features').show(truncate=False, vertical=True)

    # Print the list of terms in the vocabulary
    print(vocabulary)

    # Compute the IDF values
    idf = IDF(inputCol='raw_features', outputCol='features')
    idf_model = idf.fit(featurized_data)
    rescaled_data = idf_model.transform(featurized_data)

    # Show the rescaled_data table
    rescaled_data.show()

    # Show the "features" column of rescaled_data
    rescaled_data.select('features').show(truncate=False, vertical=True)

    # Build the Word2Vec model
    word2Vec = Word2Vec(vectorSize=3,
                        minCount=0,
                        inputCol='words',
                        outputCol='result')
    model = word2Vec.fit(words)
    w2v_df = model.transform(words)
    w2v_df.show()
    persons = []

    cPersons = db.Persones.find({})
    for secName in cPersons:
        persons.append(secName['sName'])

    # Find the two nearest synonyms of the word 'погибла' ("died");
    # findSynonyms returns a DataFrame of (word, similarity) rows
    synonyms = model.findSynonyms('погибла', 2)

    for word, similarity in synonyms.collect():
        print(str(word))

    spark.stop()
Example #38
sqlContext = SQLContext(sc)

pdDF = pd.read_csv('Megadados-Projeto2/lyrics.csv')

mySchema = StructType([ StructField("index", LongType(), True)\
                       ,StructField("song", StringType(), True)\
                       ,StructField("year", IntegerType(), True)\
                       ,StructField("artist", StringType(), True)\
                       ,StructField("genre", StringType(), True)\
                       ,StructField("lyrics", StringType(), True)])

df = sqlContext.createDataFrame(pdDF, schema=mySchema)

tokenizer = Tokenizer(inputCol="lyrics", outputCol="words")
wordsDataFrame = tokenizer.transform(df)

#Build lists of terms to treat as stop words: the 20 most frequent terms,
#terms with 3 or fewer characters, and terms containing digits
cv_tmp = CountVectorizer(inputCol="words", outputCol="tmp_vectors")
cv_tmp_model = cv_tmp.fit(wordsDataFrame)

top20 = list(cv_tmp_model.vocabulary[0:20])
three_or_fewer_characters = [
    word for word in cv_tmp_model.vocabulary if len(word) <= 3
]
contains_digits = [
    word for word in cv_tmp_model.vocabulary
    if any(char.isdigit() for char in word)
]

stopwords = []  #Add additional stopwords in this list
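# A minimal sketch (not in the original, which is cut off here) of how these lists
# would typically feed a StopWordsRemover, assuming it is imported from pyspark.ml.feature:
# extra_stopwords = stopwords + top20 + three_or_fewer_characters + contains_digits
# remover = StopWordsRemover(inputCol="words", outputCol="filtered", stopWords=extra_stopwords)
# filtered_df = remover.transform(wordsDataFrame)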
Example #39
def preprocess_tweets(tweets):
    tokenizer = Tokenizer(inputCol="text", outputCol="words")
    tweets = tokenizer.transform(tweets)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    tweets = remover.transform(tweets)
    return tweets
Example #40
# (fragment: trailing options of a DataFrame CSV read; the reader call and file path
#  are not shown in the source)
                           inferSchema=True,
                           escape='"',
                           multiLine=True)

#filter all null values in company dataset
companies = companies.filter(companies.description.isNotNull())
companies = companies.filter(companies.industry.isNotNull())

#join the two datasets
joined_df = companies.join(all_data,
                           companies['company name'] == all_data.company)
joined_df.show()

#generate tokenizer
tokenizer = Tokenizer(inputCol='position', outputCol='token')
all_data = tokenizer.transform(all_data)

#generate ngrams
ngram = NGram(n=2, inputCol="token", outputCol="ngrams")
all_data = ngram.transform(all_data)

#explode, split , group and count
all_data.select(['ngrams',
                 'location']).select('location',
                                     F.explode('ngrams').alias('ngrams'))
cities = all_data.select(['ngrams', 'location']).select(
    F.explode('ngrams').alias('ngrams'),
    F.split(all_data['location'], ',')[0].alias('city'))
cities.groupBy(['ngrams', 'city']).count().orderBy("count",
                                                   ascending=False).show()
Example #41
        return row_with_index(*[uid] + [row_dict.get(c) for c in columns])

    return _make_row

f = make_row(df.columns)

indexed = (df.rdd
           .zipWithUniqueId()
           .map(lambda x: f(*x))
           .toDF(StructType([StructField("id", LongType(), False)] + df.schema.fields)))
           
## tokenizing the reviews, removing stopwords, stemming and storing the results in a dataframe

# tokenize
tokenizer = Tokenizer(inputCol="text", outputCol="tokens")
tokenized = tokenizer.transform(indexed)
print 'done'

# remove stop words
stopwordList = ['','get','got','also','really','would','one','good','like','great','tri','love','two','three','took','awesome',
 'me','bad','horrible','disgusting','terrible','fabulous','amazing','terrific','worst','best','fine','excellent','acceptable',
 'my','exceptional','satisfactory','satisfying','super','awful','atrocious','unacceptable','poor','sad','gross','authentic',
 'myself','cheap','expensive','we', 'our', 'ours', 'ourselves', 'you', 'your', 'yours', 'yourself', 'yourselves', 'he', 'him',
 'his', 'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what',
 'which', 'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has',
 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to',
 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where',
 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same',
 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain',
 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn',
Example #42
    # and return the result.
    return " ".join(meaningful_words)


stops = set(stopwords.words("english"))
lines = sc.textFile("s3://spark-project-data/unlabeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda (row, index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))

review = parts.map(
    lambda p: Row(id=p[0], label=float(p[1]), review=review_to_words(p[2])))

schemeReview = sqlContext.createDataFrame(review)

tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label", "features")

lp = selectData.map(lambda x: LabeledPoint(x.label, x.features))

(trainingData, testData) = lp.randomSplit([0.6, 0.4])

nb = NaiveBayes.train(trainingData, 1.0)
Example #43
parts = lines.map(lambda l: l.split(","))
f = parts.map(lambda p: Row(tindex=int(p[0]),packageName=p[1],packagePermissions=p[2], label= int(float(p[3])),training=1))


linest = sc.textFile("/Users/admin/Desktop/KBSApp/KBSApp/permissionsData/dataSets/SVMDataGroundTruth.txt")
partst = linest.map(lambda l: l.split(","))
ft = partst.map(lambda p: Row(tindex=int(p[0]),packageName=p[1],packagePermissions=p[2],label= int(float(p[3])),training=0))
alldata = f.union(ft)

schemaApp = sqlContext.createDataFrame(alldata)

schemaApp.registerTempTable("data")

tokenizer = Tokenizer(inputCol="packagePermissions", outputCol="perms")
permsData = tokenizer.transform(schemaApp)

hashingTF = HashingTF(inputCol="perms", outputCol="rawFeatures")
featurizedData = hashingTF.transform(permsData)


idf = IDF(inputCol="rawFeatures", outputCol="features")


idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

wordsvectors = rescaledData["label","features"].map(lambda row: LabeledPoint(row[0], row[1]))
model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100)

labelsAndPreds = wordsvectors.map(lambda p: (p.label, model.predict(p.features)))

Example #44
from pyspark.sql import SQLContext
def preProcess(doc):
    clean = doc.replace("<br /><br />"," ")
    return clean.lower()
rdd = labeledRdd.map(lambda doc : (preProcess(doc[0]),doc[1]))

sqlContext = SQLContext(sc)

df = sqlContext.createDataFrame(rdd, ['review', 'label'])
dfTrain, dfTest = df.randomSplit([0.8,0.2])

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTrainTok = tokenizer.transform(dfTrain)

import itertools
lists=dfTrainTok.map(lambda r : r.review).collect()
dictWords=set(itertools.chain(*lists))
dictionaryWords={}
for i,word in enumerate(dictWords):
	dictionaryWords[word]=i

dict_broad=sc.broadcast(dictionaryWords)

from pyspark.mllib.linalg import SparseVector
def vectorize(row,dico):
    vector_dict={}
    for w in row.words:
        if w in dico:
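# A hypothetical completion of vectorize (the source is cut off above): a binary
# bag-of-words SparseVector keyed by the broadcast dictionary's indices.
#             vector_dict[dico[w]] = 1
#     return SparseVector(len(dico), vector_dict)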
Example #45
f = indexedTweets.map(lambda p: Row(tindex=int(p[1]),tweet=p[0][0], label= int(float(p[0][1])), training=1))
#f = parts.map(lambda p: Row(tweet=p[0],label=int(p[1])))

linest = sc.textFile("/home/ankita/MLProject/SVM/GroundTruth.txt")

partst = linest.map(lambda l: l.split(","))
indexedTweetst = partst.zipWithIndex().map(lambda (a,b): (a,b+trainingCount))
ft = indexedTweetst.map(lambda p: Row(tindex=int(p[1]),tweet=p[0][1], label= int(float(p[0][0])),training=0))
alldata = f.union(ft)

schemaTweets = sqlContext.createDataFrame(alldata)

schemaTweets.registerTempTable("data")

tokenizer = Tokenizer(inputCol="tweet", outputCol="words")
wordsData = tokenizer.transform(schemaTweets)

hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
featurizedData = hashingTF.transform(wordsData)


idf = IDF(inputCol="rawFeatures", outputCol="features")



idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
#rescaledData.collect()
wordsvectors = rescaledData.filter(rescaledData.training==1)["label","features"].map(lambda row: LabeledPoint(row[0], row[1]))
model = LogisticRegressionWithLBFGS.train(wordsvectors, iterations=100)
Example #46
def review_to_words(raw_review):
    #
    # 1. Remove HTML
    review_text = BeautifulSoup(raw_review).text
    #
    # 2. Remove non-letters        
    letters_only = re.sub("[^a-zA-Z]", " ", review_text) 
    #
    # 3. Convert to lower case, split into individual words
    words = letters_only.lower().split()                                            
    # 
    # 4. Remove stop words
    meaningful_words =  [w for w in words if not w in stops]   
    #
    # 5. Join the words back into one string separated by space, 
    # and return the result.
    return " ".join( meaningful_words)   

stops = set(stopwords.words("english")) 
lines = sc.textFile("s3://spark-project-data/labeledTrainData.tsv")
rows = lines.zipWithIndex().filter(lambda (row,index): index > 0).keys()
parts = rows.map(lambda l: l.split("\t"))

review = parts.map(lambda p: Row(id=p[0], label=float(p[1]), 
	review=review_to_words(p[2])))
schemeReview = sqlContext.createDataFrame(review)
tokenizer = Tokenizer(inputCol="review", outputCol="words")
wordsData = tokenizer.transform(schemeReview)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=300)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
selectData = rescaledData.select("label","features")
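# A hypothetical continuation (the source stops here); as in Example #42 above, the
# selected data would typically be mapped to LabeledPoints and used to train a classifier:
# lp = selectData.map(lambda x: LabeledPoint(x.label, x.features))
# (trainingData, testData) = lp.randomSplit([0.6, 0.4])
# model = NaiveBayes.train(trainingData, 1.0)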
Example #47
    def trainModel(self):

        logger.info("Training the model...")

        query = '''select page_id, max(page_title) as page_title from cooladata where date_range(all) and page_id is not null group by page_id;'''

        def SQLtoURL(query):
            data = query.replace('\n', ' ').replace('\t',' ').replace('   ',' ').replace('  ',' ')
            return data

        def QueryXXXXX(query, file = None):
            session = Session()
            response = session.post(data = {'tq': query,}, url = 'https://app.XXXXXX.com/api/v2/projects/115659/cql/', headers = {'Authorization': 'Token dtQvPVejNcSebX1EkU0AqB2TJRXznIgZiDvDu3HR'},)
            return response.content

        table = json.loads(codecs.decode(QueryXXXXX(SQLtoURL(query)), 'utf-8'))['table']
        title_list = [x['c'] for x in table['rows']]
        table_cols = [d['label'] for d in table['cols']]

        def convert_row(row):
            rowlist = [d['v'] for d in row]
            return rowlist

        rd = self.sc.parallelize(title_list).map(convert_row)
        titleData = self.spark.createDataFrame(rd, table_cols)
        titleData = titleData.dropna()

        hebrew_stopwords = stop_words()

        def rmv(words):
            for punc in punctuation:
                words = words.replace(punc, "")
            for hword in hebrew_stopwords:
                words = words.replace(hword, " ")
            return words

        self.spark.udf.register("rmv", rmv, StringType())
        titleData.registerTempTable("wordstable")
        cleanedSentenceData = self.spark.sql("select page_id, page_title, rmv(page_title) as cleanedSentence from wordstable")
        tokenizer = Tokenizer(inputCol="cleanedSentence", outputCol="words")
        wordsData = tokenizer.transform(cleanedSentenceData)

        cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", minDF = 2.0)
        cvModel = cv.fit(wordsData)
        featurizedData = cvModel.transform(wordsData)

        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        rescaledData = idfModel.transform(featurizedData)

        lda = LDA(k=100)
        ldaModel = lda.fit(rescaledData)
        postFactorizedData = ldaModel.transform(rescaledData)

        norm = Normalizer(inputCol = "topicDistribution", outputCol="normTopicDist")
        scaledFactorizedNormalizedData = norm.transform(postFactorizedData)

        self.model = scaledFactorizedNormalizedData

        logger.info("model is built!")
Example #48
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import Tokenizer, RegexTokenizer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="TokenizerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    sentenceDataFrame = sqlContext.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["label", "sentence"])
    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsDataFrame = tokenizer.transform(sentenceDataFrame)
    for words_label in wordsDataFrame.select("words", "label").take(3):
        print(words_label)
    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)
    # $example off$

    sc.stop()
Example #49
#LOADING DATA FROM HDFS TO SPARK DATAFRAME
df0=spark.read.option("sep", "\t").option('header',True).csv('hdfs://192.168.50.93:9000/user/hadoop/books2/amazon_reviews_us_Wireless_v1_00.tsv')
df0.printSchema()

#FILTERING OUT EMPTY VALUES
df01 = df0.filter((col("review_body").isNotNull()) & (col("verified_purchase").isNotNull()))

#ENCODING LABEL
stage_string = StringIndexer(inputCol="verified_purchase", outputCol="class_res")
ppl = Pipeline(stages=[stage_string])
df1 = ppl.fit(df01).transform(df01)

#CREATING TF_IDF
tokenizer = Tokenizer(inputCol="review_body", outputCol="words")
wordsData = tokenizer.transform(df1)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

#NAIVEBAYES 
nb = NaiveBayes(featuresCol="features", labelCol="class_res")

#Model training
model = nb.fit(rescaledData)

#Model Saving
model.write().overwrite().save("./NB_model")
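# A brief follow-up sketch (not in the original example): the persisted model can be
# reloaded for inference with the standard pyspark.ml loader.
from pyspark.ml.classification import NaiveBayesModel
reloaded_model = NaiveBayesModel.load("./NB_model")
reloaded_model.transform(rescaledData).select("class_res", "prediction").show(5)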
Example #50
print "Create dataframe"
t0 = time()
df = sqlContext.createDataFrame(rdd, ['review', 'label'])
print "Showing first example : "
print
print df.first()
tt = time() - t0
print
print "Dataframe created in {} second".format(round(tt,3))


# In[314]:

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer(inputCol='review', outputCol='words')
dfTok = tokenizer.transform(df)


# In[315]:

from pyspark.ml.feature import NGram
bigram = NGram(inputCol="words", outputCol="bigrams")
dfBigram = bigram.transform(dfTok)


# In[317]:

print "Start tokenizing, computing bigrams and splitting between test and train"
t0 = time()
dfTrain, dfTest = dfBigram.randomSplit([0.8,0.2])
dfTrain.take(1)
Example #51
data = df.rdd.map(list)
print(data.first())

score = data.map(lambda s: 1.0
                 if s[1].isdigit() and float(s[1]) == 1.0 else 0.0)
comment = data.map(lambda s: s[3])
#words = comment.map(lambda w:"/".join(jieba.cut_for_search(w))).map(lambda line:line.split("/"))
split_neg_data2 = score.zip(comment)
tranform_data = split_neg_data2.map(
    lambda p: (p[0], p[1]))  #.toDF()#.withColumnRenamed('_1','label')
#tranform_data.show()
#sentenceData = spark.createDataFrame([(0, "I heard about Spark and I love Spark"),(0, "I wish Java could use case classes"),(1, "Logistic regression models are neat")]).toDF("label", "sentence")

sentenceData = spark.createDataFrame(tranform_data, ["label", "sentence"])
tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
wordsData = tokenizer.transform(sentenceData)

# Compute TF-IDF
hashingTF = HashingTF(inputCol="words",
                      outputCol="rawFeatures",
                      numFeatures=3000)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)

forData = StringIndexer().setInputCol("label").setOutputCol("indexed").fit(
    rescaledData).transform(rescaledData)
(trainingData, testData) = forData.randomSplit([0.8, 0.2], seed=0)
print(trainingData.take(1))
rfClassifier = RandomForestClassifier(numTrees=10,
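# (hypothetical completion; the constructor call is cut off in the source)
#                                       labelCol="indexed",
#                                       featuresCol="features")
# rfModel = rfClassifier.fit(trainingData)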
Example #52
print "################"

#Step 0:clean the data

print "check the types"
#print fulldata.dtypes
print "################"
print "CLEANING Data:"
fulldata["product_title_clean"] = fulldata["product_title"].apply(cf.rmP)
#print fulldata["product_title_clean"].head()
#TF-IDF features

#Step 1: split text field into words
print "STEP 1################"
tokenizer = Tokenizer(inputCol="product_title_clean", outputCol="words_title")
fulldata = tokenizer.transform(fulldata)
print "Tokenized Title:"
print fulldata.head()
print "################"
#Step 2: compute term frequencies
hashingTF = HashingTF(inputCol="words_title", outputCol="tf")
fulldata = hashingTF.transform(fulldata)
print "TERM frequencies:"
print fulldata.head()
print "################"
#Step 3: compute inverse document frequencies
idf = IDF(inputCol="tf", outputCol="tf_idf")
idfModel = idf.fit(fulldata)
fulldata = idfModel.transform(fulldata)
print "IDF :"
print fulldata.head()
Example #53
if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("TfIdfExample")\
        .getOrCreate()

    # $example on$
    sentenceData = spark.createDataFrame([
        (0, "Hi I heard about Spark"),
        (0, "I wish Java could use case classes"),
        (1, "Logistic regression models are neat")
    ], ["label", "sentence"])

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")
    wordsData = tokenizer.transform(sentenceData)

    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    # alternatively, CountVectorizer can also be used to get term frequency vectors
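    # For instance (hypothetical alternative, not part of the original example):
    # cv = CountVectorizer(inputCol="words", outputCol="rawFeatures", vocabSize=20)
    # featurizedData = cv.fit(wordsData).transform(wordsData)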

    idf = IDF(inputCol="rawFeatures", outputCol="features")
    idfModel = idf.fit(featurizedData)
    rescaledData = idfModel.transform(featurizedData)

    for features_label in rescaledData.select("features", "label").take(3):
        print(features_label)
    # $example off$

    spark.stop()
Example #54
airportCleanDF = airportCleanDF.withColumn("airport_staff_ratingf", fn.col("airport_staff_rating").cast("float"))
airportCleanDF = airportCleanDF.withColumn("recommendedi", fn.col("recommended").cast("integer"))

airportCleanDF = reduce(DataFrame.drop, ['overall_rating','queuing_rating', 'terminal_cleanliness_rating', 'terminal_seating_rating', 'terminal_signs_rating', 'food_beverages_rating', 'airport_shopping_rating', 'wifi_connectivity_rating', 'airport_staff_rating','recommended'], airportCleanDF)


# In[7]:

from pyspark.ml.feature import Tokenizer
tokenizer = Tokenizer().setInputCol('content').setOutputCol('words')


# In[8]:

airlineCleanDF = airlineCleanDF.na.drop(subset=["content"]) # Remove rows with NULL in column 'content'
tokenizer.transform(airlineCleanDF)


# In[7]:

#tokenizer.transform(airlineCleanDF).show(5)


# In[5]:

from pyspark.ml.feature import CountVectorizer


# In[9]:

count_vectorizer_estimator = CountVectorizer().setInputCol('words').setOutputCol('features')
Example #55
# COMMAND ----------

summary = model.summary
print model.weights
model.gaussiansDF.show()
summary.cluster.show()
summary.clusterSizes
summary.probability.show()


# COMMAND ----------

from pyspark.ml.feature import Tokenizer, CountVectorizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.drop("features"))
cv = CountVectorizer()\
  .setInputCol("DescOut")\
  .setOutputCol("features")\
  .setVocabSize(500)\
  .setMinTF(0)\
  .setMinDF(0)\
  .setBinary(True)
cvFitted = cv.fit(tokenized)
prepped = cvFitted.transform(tokenized)


# COMMAND ----------

from pyspark.ml.clustering import LDA
lda = LDA().setK(10).setMaxIter(5)
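# Hypothetical continuation (the notebook fragment ends here): fit LDA on the
# CountVectorizer output and inspect the discovered topics.
ldaModel = lda.fit(prepped)
ldaModel.describeTopics(3).show(truncate=False)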
Example #56
# TF: Term Frequency ---> importance of the term within a given document
# IDF: Inverse Document Frequency ---> importance of the term in the corpus (full dictionary of words and documents)
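# (In Spark ML, IDF(t) = log((N + 1) / (df(t) + 1)), where N is the number of
#  documents and df(t) is the number of documents containing term t.)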

spark = SparkSession.builder.appName('NLP').getOrCreate()

sen_df = spark.createDataFrame([(0, 'Hi I heard about Spark'), (1, 'I whish java could use case classes'), (2, 'Logistic,regression,models,are,neat')], ['id', 'sentence'])

sen_df.show()

tokenizer = Tokenizer(inputCol = 'sentence', outputCol = 'words')
regex_tokenizer = RegexTokenizer(inputCol = 'sentence', outputCol = 'words', pattern = '\\W')

count_tokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(sen_df)
tokenized.withColumn('tokens', count_tokens(col("words"))).show()

# Remove commas INSIDE the words
rg_tokenized = regex_tokenizer.transform(sen_df)
rg_tokenized.withColumn('tokens', count_tokens(col('words'))).show()

# Remove stop words
sentence_df = spark.createDataFrame([(0, ['I', 'saw', 'the', 'green', 'horse']), (1, ['Mary', 'had', 'a', 'little', 'lamb'])], ['id', 'tokens'])
sentence_df.show()

remover = StopWordsRemover(inputCol = 'tokens', outputCol = 'filtered')
remover.transform(sentence_df).show()

# n-gram
wordDataFrame = spark.createDataFrame([
Example #57
import argparse

from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
from pyspark.ml.feature import Tokenizer

def filter_comments(df):
    return df.filter(df['author'] != '[deleted]') \
             .filter(df['body'] != '[deleted]') \
             .filter(df['body'] != '[removed]')

if __name__ == '__main__':

    parser = argparse.ArgumentParser(description='Reddit Comment Prediction')
    parser.add_argument('-i', '--input_file', type=str, 
        help="""The CSV input data file that contains the raw comment data""")
    args = parser.parse_args()

    sc = SparkContext("local", "Prediction")
    sqlContext = SQLContext(sc)
    df = sqlContext.read.json(args.input_file)
    print 'Loaded input file {} with {} total comments'.format(args.input_file, df.count())

    filtered = filter_comments(df)
    print '{} comments after filtering'.format(filtered.count())

    tokenizer = Tokenizer(inputCol="body", outputCol="words")
    wordsDataFrame = tokenizer.transform(filtered)
    wordsDataFrame.select("body", "words").show() 
Example #58
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import Tokenizer, CountVectorizer
from pyspark.ml.clustering import LDA
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, LongType
from pyspark.sql.functions import col

spark = SparkSession.builder.appName('Project').getOrCreate()

dataset=spark.read.csv("reviews.tbl", inferSchema = True, header = True, sep = '|')

dataset.createTempView("product_reviews")
q="SELECT CASE pr_rating WHEN 1 THEN 'NEG' WHEN 2 THEN 'NEG' WHEN 3 THEN 'NEU' WHEN 4 THEN 'POS' WHEN 5 THEN 'POS' END AS pr_r_rating, pr_content FROM product_reviews WHERE pmod(pr_review_id, 5) IN (1,2,3)"
df = spark.sql(q).toDF("label", "sentence")
tokenizer = Tokenizer(inputCol="sentence", outputCol="tokens")
wordsData = tokenizer.transform(df)
# remove stop words
remover = StopWordsRemover(inputCol="tokens", outputCol="words")
cleaned = remover.transform(wordsData)

# vectorize
cv = CountVectorizer(inputCol="words", outputCol="features")
count_vectorizer_model = cv.fit(cleaned)
result = count_vectorizer_model.transform(cleaned)

#corpus = result.select('vectors').rdd.map(lambda x: Row (x[0])).toDF()
#corpus=corpus.select(col("_1").alias("features"))

ldaModel = LDA(k=4, maxIter =100)
model = ldaModel.fit(result)
# extracting topics
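# A hypothetical continuation (the source stops here): map the topic term indices
# back to the CountVectorizer vocabulary to read off the topics.
vocab = count_vectorizer_model.vocabulary
for row in model.describeTopics(5).collect():
    print([vocab[i] for i in row.termIndices])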
Example #59
    # (fragment: the snippet begins inside a prediction helper whose definition is not shown)
    hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
    featurizedData = hashingTF.transform(wordsData)
    rescaledData = idfModel.transform(featurizedData)
    myprediction = model.transform(rescaledData)
    return myprediction


"""连接master"""
conf = SparkConf().setAppName('tfidf').setMaster('spark://HP-Pavilion:7077')
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
"""处理数据集,生成特征向量"""
dfTitles = sqlContext.read.parquet('roll_news_sina_com_cn.parquet')
print(dfTitles.dtypes)
tokenizer = Tokenizer(inputCol="title", outputCol="words")
wordsData = tokenizer.transform(dfTitles)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=20)
featurizedData = hashingTF.transform(wordsData)
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(featurizedData)
rescaledData = idfModel.transform(featurizedData)
rescaledData.show()
for features_label in rescaledData.select("features", "rawFeatures").take(3):
    print(features_label)
"""决策树模型培训"""
# Index labels, adding metadata to the label column.
# Fit on whole dataset to include all labels in index.
labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(rescaledData)
# Automatically identify categorical features, and index them.
# We specify maxCategories so features with > 4 distinct values are treated as continuous.
featureIndexer =\
Example #60
def strip_tags(html):
    return parser.unescape(
        expression.sub('', html)
        )

strip_tags_udf = udf(strip_tags)
tokenizer = Tokenizer(inputCol="comment_clean", outputCol="words")
stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="tokens")

# Load data
comments = sqlContext.read.json(fn)

# Calculate the tokens dataframe as one pipeline
tokens = stopWordsRemover.transform(
             tokenizer.transform(comments\
                 .withColumn("comment_clean", strip_tags_udf(comments["comment_text"]))\
             )\
         )\
         .select(explode("tokens").alias("token"))\
         .groupBy("token")\
         .count()\
         .orderBy("count", ascending=False)\
         .select("count")\
         .limit(1000)

# Switch to Pandas
tokens_pdf = tokens.toPandas()
tokens_pdf = tokens_pdf.iloc[1:]  # drop the first row (pandas .ix is deprecated)
tokens_pdf["rank"] = range(1, tokens_pdf.shape[0] + 1)
print(tokens_pdf.head())
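# A possible follow-up (not in the original snippet): the rank/count columns lend
# themselves to a Zipf-style log-log plot, e.g. with matplotlib:
# import matplotlib.pyplot as plt
# plt.loglog(tokens_pdf["rank"], tokens_pdf["count"])
# plt.xlabel("rank"); plt.ylabel("count")
# plt.show()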