Example #1
def tokenize(p_df, in_column, out_column):
    """
    Tokenizes a column in a DataFrame.
    :param p_df: A DataFrame.
    :param in_column: Name of the input column.
    :param out_column: Name of the output column.
    :return: A DataFrame.
    """
    tokenizer = RegexTokenizer(inputCol=in_column, outputCol=out_column, pattern="\\W")
    return tokenizer.transform(p_df)
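
# A minimal usage sketch (not part of the original snippet), assuming an active SparkSession
# named `spark` and `from pyspark.ml.feature import RegexTokenizer` at module level.
df = spark.createDataFrame([(0, "Hi there, Spark ML!")], ["id", "text"])
tokenize(df, "text", "tokens").show(truncate=False)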
Example #2
def tokenize(df, column):
    """
    Tokenize alpha-numeric words. Set all tokens to lower-case and 
    remove short terms having less than 3 characters.
    """
    # creates tokenizer based on regular expressions
    wordTokenizer = RegexTokenizer(
        inputCol=column, 
        outputCol='_'+column, 
        pattern='\w+'
    ).setGaps(False) # match tokens rather than gaps
    
    # transform: string --> array<string>
    df = wordTokenizer.transform(df) 
    
    df = replace(df, column, '_'+column)
    return df
def main():

    spark = SQLContext(SparkContext.getOrCreate())

    # read data
    yahoo = spark.read.csv(f'{BUILDDIR}/yahoo.csv', header=True)
    data = yahoo.select(['sector', 'description']).dropna()

    # tokenize texts based on regular expression
    tokenize = RegexTokenizer(inputCol='description',
                              outputCol='words_all',
                              pattern=r'\W')

    # remove stop words
    stopwords = '\n'.join((DATADIR / 'stopwords' / f).read_text().strip()
                          for f in ('mysql.txt', 'nltk.txt')).splitlines()
    remove_stopwords = StopWordsRemover(
        inputCol='words_all', outputCol='words_clean').setStopWords(stopwords)

    # get word frequencies using a simple count (bag of words)
    add_wordcount = CountVectorizer(inputCol='words_clean',
                                    outputCol='words_count',
                                    vocabSize=1000,
                                    minDF=2)

    # get TF-IDF word frequencies
    add_wordtf = HashingTF(inputCol='words_clean',
                           outputCol='words_tf',
                           numFeatures=10000)
    add_wordidf = IDF(inputCol='words_tf',
                      outputCol='words_tfidf',
                      minDocFreq=2)

    # prepare output values
    index_target = StringIndexer(inputCol='sector', outputCol='label')

    # data preparation pipeline
    pipeline_wordcount = Pipeline(stages=[
        tokenize,
        remove_stopwords,
        add_wordcount,
        add_wordtf,
        add_wordidf,
        index_target,
    ])
    # apply data preparation pipeline
    model_wordcount = pipeline_wordcount.fit(data)
    prepared = model_wordcount.transform(data)

    # split to training and testing
    training, testing = prepared.randomSplit([0.8, 0.2], seed=100500)

    # fit logistic regression models

    logistic_wordcount = LogisticRegression(regParam=0.3,
                                            elasticNetParam=0,
                                            featuresCol='words_count',
                                            labelCol='label',
                                            predictionCol='prediction',
                                            probabilityCol='probability')

    logistic_tfidf = LogisticRegression(regParam=0.3,
                                        elasticNetParam=0,
                                        featuresCol='words_tfidf',
                                        labelCol='label',
                                        predictionCol='prediction',
                                        probabilityCol='probability')

    evaluator = MulticlassClassificationEvaluator(predictionCol='prediction',
                                                  metricName='accuracy')
    for model, name in ((logistic_wordcount,
                         'Word count + Logistic regression'),
                        (logistic_tfidf, 'TF-IDF + Logistic regression')):
        predicted = model.fit(training).transform(testing)
        print(f'{name} model accuracy = {evaluator.evaluate(predicted)}')
Example #4
    .orderBy(col("count").desc()) \
    .show()

data.groupBy("SentimentText") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

# set seed for reproducibility
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="SentimentText",
                                outputCol="words",
                                pattern="\\W")

# stop words
add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the"]
stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# bag of words count
countVectors = CountVectorizer(inputCol="filtered",
                               outputCol="features",
                               vocabSize=10000,
                               minDF=5)

# convert string labels to indexes
label_stringIdx = StringIndexer(inputCol="Sentiment", outputCol="label")
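
# A hedged sketch (not part of the original excerpt): assemble the stages defined above into a
# pipeline and fit it on the training split produced earlier.
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx])
pipelineFit = pipeline.fit(trainingData)
train_prepared = pipelineFit.transform(trainingData)
train_prepared.show(5)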
Example #5
# MAGIC %md
# MAGIC Split the Wikipedia text into sentences.

# COMMAND ----------

pattern = r"(\. |\n{2,})"
import re

matches = re.findall(pattern, "Wiki page. *More information*\n\n And a line\n that continues.")
print(matches)

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer(inputCol="text", outputCol="sentences", pattern=pattern)
sentences = tokenizer.transform(parsed).select("sentences")
display(sentences)

# COMMAND ----------

from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType

sentenceRDD = sentences.rdd.flatMap(lambda r: r[0]).map(lambda x: Row(sentence=x))

sentenceSchema = StructType([StructField("sentence", StringType())])
sentence = sqlContext.createDataFrame(sentenceRDD, sentenceSchema)

display(sentence)
train_df.show()
mapping_df.show()
df1=train_df.select("genre")
#train_df.column("genre")
df1.show(5)

train_df.printSchema()

from pyspark.ml.feature import Tokenizer, RegexTokenizer


#plotToken = Tokenizer(inputCol="plot", outputCol="splitWords")
#plotToken.transform(dataset).head()
#dataset.na.drop(subset=["plot"])
regexTokenizer = RegexTokenizer(inputCol="plot", outputCol="tokens", pattern="\\W")
#dataset = regexTokenizer.transform(train_df)
#dataset.printSchema()
#dataset.select("plot").show(5)

from pyspark.ml.feature import StopWordsRemover
remover= StopWordsRemover(inputCol="tokens", outputCol="stopRemove")
#dataset=remover.transform(dataset)
#dataset.printSchema()
#dataset.show(2)

from pyspark.ml.feature import CountVectorizer,HashingTF
from pyspark.sql.functions import when, col, coalesce, array
from pyspark.ml import Pipeline

#fillNull = array().cast("array<string>")  # solution for null handling, from Stack Overflow
Example #7
def load_data(manifest, base='gs', kind='bytes'):
    '''Load data from a manifest file into a DataFrame.

    A manifest file gives the hash identifying each document on separate lines.

    The returned DataFrame has columns `id`, `url`, and `text` where `id`
    is a document identifier, `url` is the path to the document, and `text`
    is the contents.

    Note that the document ID is _not_ the same as the hash. The ID is
    guaranteed to uniquely identify one document and the order of the IDs is
    guaranteed to match the order given in the manifest file.

    Args:
        manifest (path):
            Path or URL of the manifest file.
        base (path):
            The base of the URL or path to the data. The special strings 'gs'
            and 'https' expand to the URLs used by Data Science Practicum at
            UGA over the Google Storage and HTTPS protocols respectively.
        kind (str):
            The kind of file to use, one of 'bytes' or 'asm'.
            - 'bytes' loads hex strings for the bytes in the binary files.
            - 'asm' loads segment titles and the opcodes from the asm files.

    Returns:
        DataFrame[id: bigint, url: string, text: string]
    '''
    spark = elizabeth.session()
    ctx = spark.sparkContext

    # Special base paths
    if base == 'https': base = 'https://storage.googleapis.com/uga-dsp/project2/data'
    if base == 'gs': base = 'gs://uga-dsp/project2/data'

    # normalize kind: treat anything other than 'bytes' as 'asm'
    if kind != 'bytes':
        kind = 'asm'

    # Read the manifest as an iterator over (id, url).
    # We use Spark to build the iterator to support hdfs etc.
    manifest = str(manifest)  # cast to str to support pathlib.Path etc.
    manifest = ctx.textFile(manifest)                           # RDD[hash]
    manifest = manifest.map(hash_to_url(base=base, kind=kind))  # RDD[url]
    manifest = manifest.zipWithIndex()                          # RDD[url, id]
    manifest = manifest.map(lambda x: (x[1], x[0]))             # RDD[id, url]
    manifest = manifest.toLocalIterator()                       # (id, url)

    # Load all files in the base directory, then join out the ones in the manifest.
    prepend = lambda *args: lambda x: (*args, *x)
    data = ((id, ctx.wholeTextFiles(url)) for id, url in manifest)  # (id, RDD[url, text])
    data = [rdd.map(prepend(id)) for id, rdd in data]               # [RDD[id, url, text]]
    data = ctx.union(data)                                          # RDD[id, url, text]
    data = data.toDF(['id', 'url', 'text'])                         # DF[id, url, text]

    # Tokenization : DF[id, url, text, features]
    tokenizer = RegexTokenizer(inputCol='text', outputCol='features', gaps=False)
    opcodes = '|'.join(_opcodes)
    if kind == 'bytes': tokenizer.setPattern('(?<= )[0-9A-F]{2}')
    elif kind == 'asm': tokenizer.setPattern(r'(\.?\w+:(?=[0-9A-F]{8}\s))|(\b(' + opcodes + r')\b)')
    data = tokenizer.transform(data)
    data = data.drop('text')

    return data
def main(argv=None):
    if argv is None:
        inputs_train = sys.argv[1]
        inputs_test = sys.argv[2]

    conf = SparkConf().setAppName('sentiment-analysis-word2vec-cluster')
    sc = SparkContext(conf=conf)
    sqlCt = SQLContext(sc)

    #read train json file and prepare data (label, feature)
    text = sqlCt.read.json(inputs_train)
    train = text.select('overall',
                        'reviewText').withColumnRenamed('overall', 'label')
    train.cache()

    ## DATA PROCESSING PIPELINE
    # Split at whitespace and characters that are not letter
    tokenizer = RegexTokenizer(inputCol="reviewText",
                               outputCol="words",
                               pattern="\\P{Alpha}+")

    # stopword remover
    remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")

    pipeline_data_processing = Pipeline(stages=[tokenizer, remover])
    model_data_processing = pipeline_data_processing.fit(train)
    train_processed = model_data_processing.transform(train)
    train.unpersist()
    train_processed.cache()

    ## INTERMEDIATE STEP TO GET WORD VOCABULARY AND VECTOR
    # word2vec
    word2Vec = Word2Vec(inputCol="filtered_words",
                        outputCol="word2vec_features")
    model_word2Vec = word2Vec.fit(train_processed)
    # Dataframe dictionary of Word-vectors
    vocabulary = model_word2Vec.getVectors()
    vocabulary.cache()

    ## ML PIPELINE
    # WordCluster Features
    wordcluster = WordCluster(inputCol="filtered_words", predictionCol="cluster", \
                              k=3, vocabulary=vocabulary)

    # get vector of cluster frequency for each document
    count_vectorizer = CountVectorizer(inputCol="cluster", outputCol="count")

    # normalized cluster frequency vector for each document
    normalizer = Normalizer(inputCol="count", outputCol="features", p=1.0)

    # linear Regression Model
    lr = LinearRegression(maxIter=20, regParam=0.1)

    # Final Pipeline
    pipeline = Pipeline(stages=[wordcluster, count_vectorizer, normalizer, lr])

    ## FIT MODEL USING CROSS VALIDATION
    # Parameter grid for cross validation: numFeatures and regParam
    paramGrid = ParamGridBuilder() \
            .addGrid(wordcluster.k, [1000, 5000, 10000, 20000]) \
            .addGrid(lr.regParam, [0.001, 0.01, 0.1, 1.0]) \
            .build()

    # 5-fold cross validation
    evaluator = RegressionEvaluator(metricName="rmse")
    crossval = CrossValidator(estimator=pipeline,
                              estimatorParamMaps=paramGrid,
                              evaluator=evaluator,
                              numFolds=5)

    # Run cross-validation, and choose the best set of parameters.
    model = crossval.fit(train_processed)

    # RMSE on train data
    prediction_train = model.transform(train_processed)
    rmse_train = evaluator.evaluate(prediction_train)
    train_processed.unpersist()
    vocabulary.unpersist()

    ## TEST DATA
    #read test json file and process data (label, feature)
    text = sqlCt.read.json(inputs_test)
    test = text.select('overall',
                       'reviewText').withColumnRenamed('overall', 'label')
    test_processed = model_data_processing.transform(test)

    # Evaluate the model on test data
    prediction_test = model.transform(test_processed)
    rmse_test = evaluator.evaluate(prediction_test)

    # Print Result
    result = "MODEL WITH Word Clustering features - best k = " \
          + str(model.bestModel.stages[0].getK()) + ":\n"
    result = result + "-Train RMSE: " + str(rmse_train) + "\n"
    result = result + "-Test RMSE: " + str(rmse_test) + "\n"
    print(result)
Example #9
sc = SparkContext(appName="Tweet")
spark = SparkSession(sc)

sc.setLogLevel("WARN")

# Elastic Search
conf = SparkConf(loadDefaults=False)
conf.set("es.index.auto.create", "true")
# read the dataset  
training_set = spark.read.csv('../tap/spark/dataset/training_set_sentipolc16.csv',
                         schema=schema,
                         header=True,
                         sep=',')

# define stage 1: tokenize the tweet text    
stage_1 = RegexTokenizer(inputCol='tweet', outputCol='tokens', pattern='\\W')
# define stage 2: remove the stop words
stage_2 = StopWordsRemover(inputCol='tokens', outputCol='filtered_words')
# define stage 3: create a word vector of the size 100
stage_3 = Word2Vec(inputCol='filtered_words', outputCol='vector', vectorSize=100)
# define stage 4: Logistic Regression Model
model = LogisticRegression(featuresCol='vector', labelCol='positive')
# setup the pipeline
pipeline = Pipeline(stages=[stage_1, stage_2, stage_3, model])

# fit the pipeline model with the training data
pipelineFit = pipeline.fit(training_set)

modelSummary = pipelineFit.stages[-1].summary
print(modelSummary.accuracy)
from pyspark.sql import SparkSession
from pyspark.sql import SQLContext
from pyspark.ml import Pipeline
from kafka import KafkaConsumer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

#-------------------Building the logistic regression and naive bayes pipelines----------------------------------

if __name__ == "__main__":
    sc = SparkContext.getOrCreate()
    sc.setLogLevel("ERROR")

    sqlContext = SQLContext(sc)

    regex_tokenizer = RegexTokenizer(inputCol="text",
                                     outputCol="words",
                                     pattern="\\W")

    stop_words = []
    with open('/home/asdf/Documents/stopwords.txt', 'r') as contents:
        stop_words = contents.read().split()

    stop_words_remover = StopWordsRemover(
        inputCol="words", outputCol="filtered").setStopWords(stop_words)

    count_vectors = CountVectorizer(inputCol="filtered",
                                    outputCol="features",
                                    vocabSize=10000,
                                    minDF=5)

    lr = LogisticRegression(maxIter=100, regParam=0.01)
Example #11
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("TokenizerExample")\
        .getOrCreate()

    # $example on$
    sentenceDataFrame = spark.createDataFrame([
        (0, "Hi I heard about Spark"),
        (1, "I wish Java could use case classes"),
        (2, "Logistic,regression,models,are,neat")
    ], ["label", "sentence"])

    tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

    regexTokenizer = RegexTokenizer(inputCol="sentence", outputCol="words", pattern="\\W")
    # alternatively, pattern="\\w+", gaps(False)

    tokenized = tokenizer.transform(sentenceDataFrame)
    for words_label in tokenized.select("words", "label").take(3):
        print(words_label)

    regexTokenized = regexTokenizer.transform(sentenceDataFrame)
    for words_label in regexTokenized.select("words", "label").take(3):
        print(words_label)
    # $example off$

    spark.stop()
    else:
        return "../examples/smalldata/" + file_name

## This method loads the data, performs some basic filtering, and creates a Spark DataFrame
def load():
    row_rdd = spark.sparkContext.textFile(_locate("smsData.txt")).map(lambda x: x.split("\t", 1)).filter(lambda r: r[0].strip())
    return spark.createDataFrame(row_rdd, ["label", "text"])

##
## Define the pipeline stages
##

## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
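
## A hedged sketch of the IDF stage announced above (assumes `from pyspark.ml.feature import IDF`;
## the minDocFreq value is illustrative).
idf = IDF(minDocFreq=4,
          inputCol=hashingTF.getOutputCol(),
          outputCol="tf_idf")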

# COMMAND ----------

from pyspark.ml.feature import Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(20, False)


# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")\
  .setToLowercase(True)
rt.transform(sales.select("Description")).show(20, False)


# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")\
  .setGaps(False)\
  .setToLowercase(True)
rt.transform(sales.select("Description")).show(20, False)
Example #14
import pandas as pd
from pyspark.sql import SQLContext
from pyspark.ml.feature import RegexTokenizer, HashingTF
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.classification import LogisticRegressionWithSGD, SVMWithSGD, NaiveBayes
from pyspark.mllib.tree import RandomForest

## Load Dataset
df_pandas = pd.read_csv('sample.csv')

## Convert to Spark Dataframe
sqlContext = SQLContext(sc)
df = sqlContext.createDataFrame(df_pandas)

## Tokenizer and Hashing 
tokenizer = RegexTokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(numFeatures=10000, inputCol="words", outputCol="features")
df_feat = hashingTF.transform(tokenizer.transform(df))

## Create LabeledPoint and Features for Prediction (predict the 1s observations)
lp = df_feat.map(lambda x: LabeledPoint(x.label, x.features))
predict_feat = df_feat.where(df_feat.label == 1).map(lambda x: x.features)


## Compare predictions from Different Models


## Logistic Regression
lrm = LogisticRegressionWithSGD.train(lp, iterations=10)
logit_predict = lrm.predict(predict_feat)
logit_predict.sum()
test_df = test_df.withColumnRenamed('_2', 'id')
test_df = test_df.select('id', 'category', 'text')

writeToFile("\nTask 1.1 (b)\n")
writeToFile("First 5 rows of 'INDEXED' test set \n\n")
k = test_df.take(5)
for i, row in enumerate(k):
    row_name = 'Row-' + str(i)
    writeToFile(row_name + '\n')
    writeToFile(str(row[0]) + ', ' + str(row[1]) + ', ' + str(row[2]) + '\n\n')

########################################################################################################
# Build pipeline and run
indexer = StringIndexer(inputCol="category", outputCol="label")
tokenizer = RegexTokenizer(pattern=r'\W+',
                           inputCol="text",
                           outputCol="words",
                           toLowercase=False)
hashingTF = HashingTF(inputCol="words", outputCol="rawFeatures")
idf = IDF(inputCol="rawFeatures", outputCol="features")
lr = LogisticRegression(maxIter=20, regParam=0.001)

# Builing model pipeline
pipeline = Pipeline(stages=[indexer, tokenizer, hashingTF, idf, lr])

# Train model on training set
model = pipeline.fit(
    train_df
)  #if you give new names to your indexed datasets, make sure to make adjustments here

# Model prediction on test set
pred = model.transform(test_df)  # ...and here
Example #16
data.groupBy("text") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

# set seed for reproducibility
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed=100)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))
trainingData.printSchema()
trainingData.show(5)
testData.show(5)

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words")

# stop words
add_stopwords = ["http", "https", "amp", "rt", "t", "c", "the", "RT"]
stopwordsRemover = StopWordsRemover(
    inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# bag of words count
countVectors = CountVectorizer(inputCol="filtered",
                               outputCol="features",
                               vocabSize=10000,
                               minDF=5)

# convert string labels to indexes
label_stringIdx = StringIndexer(inputCol="airline_sentiment",
                                outputCol="label")
from pyspark.ml.feature import (RegexTokenizer, StopWordsRemover, Word2Vec)

_regex_tokenizer = RegexTokenizer(inputCol='tweet',
                                  outputCol='tokens',
                                  pattern='\\W')

_stop_word_remover = StopWordsRemover(inputCol='tokens',
                                      outputCol='filtered_words')

_word_2_vec = Word2Vec(
    inputCol='filtered_words',
    outputCol='vector',
    vectorSize=100,
    minCount=5,
    numPartitions=1,
    stepSize=0.025,
    maxIter=1,
)

TRANSFORMERS = [_regex_tokenizer, _stop_word_remover, _word_2_vec]
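
# A hedged illustration (not part of the original module) of how TRANSFORMERS might be consumed:
# prepend the shared feature stages to an estimator in a single pipeline. The label column name
# is an assumption.
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

pipeline = Pipeline(stages=TRANSFORMERS + [LogisticRegression(featuresCol='vector', labelCol='label')])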
Example #18
# MAGIC %md
# MAGIC Split the Wikipedia text into sentences.

# COMMAND ----------

pattern = r'(\. |\n{2,})'
import re
matches = re.findall(pattern, 'Wiki page. *More information*\n\n And a line\n that continues.')
print(matches)

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer(inputCol='text', outputCol='sentences', pattern=pattern)
sentences = tokenizer.transform(parsed).select('sentences')
display(sentences)

# COMMAND ----------

from pyspark.sql import Row
from pyspark.sql.types import StructType, StructField, StringType

sentenceRDD = (sentences.rdd
               .flatMap(lambda r: r[0])
               .map(lambda x: Row(sentence=x)))

sentenceSchema = StructType([StructField('sentence', StringType())])
sentence = sqlContext.createDataFrame(sentenceRDD, sentenceSchema)
Example #19
if __name__ == "__main__":
    time1 = datetime.datetime.now()
    spark = SparkSession\
        .builder\
        .appName("news")\
        .config("spark.some.config.option", "some-value")\
        .getOrCreate()
    bucket_path = sys.argv[1]
    news_data = spark.read.csv(bucket_path, header='True', inferSchema='True')
    title_category = news_data.select("TITLE", "CATEGORY")
    title_category = title_category.dropna()
    title_category = title_category.withColumn(
        "only_str", regexp_replace(col('TITLE'), '\d+', ''))
    regex_tokenizer = RegexTokenizer(inputCol="only_str",
                                     outputCol="words",
                                     pattern="\\W")
    raw_words = regex_tokenizer.transform(title_category)
    remover = StopWordsRemover(inputCol="words", outputCol="filtered")
    words_df = remover.transform(raw_words)
    indexer = StringIndexer(inputCol="CATEGORY", outputCol="categoryIndex")
    feature_data = indexer.fit(words_df).transform(words_df)
    cv = CountVectorizer(inputCol="filtered", outputCol="features")
    model = cv.fit(feature_data)
    countVectorizer_features = model.transform(feature_data)
    (trainingData,
     testData) = countVectorizer_features.randomSplit([0.8, 0.2], seed=11)
    nb = NaiveBayes(modelType="multinomial",
                    labelCol="categoryIndex",
                    featuresCol="features")
    nbModel = nb.fit(trainingData)
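
    # A hedged sketch of an evaluation step (not in the original script): score the held-out
    # split and report accuracy.
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    predictions = nbModel.transform(testData)
    evaluator = MulticlassClassificationEvaluator(labelCol="categoryIndex",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    print("Test set accuracy:", evaluator.evaluate(predictions))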
Example #20
# by top 20 categories
data.groupBy("Category") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()

# by top 20 descriptions
data.groupBy("Descript") \
    .count() \
    .orderBy(col("count").desc()) \
    .show()


# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="Descript", outputCol="words", pattern="\\W")

# stop words
add_stopwords = ["http","https","amp","rt","t","c","the"] # standard stop words

stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)

# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)


label_stringIdx = StringIndexer(inputCol = "Category", outputCol = "label")

transformers=[regexTokenizer, stopwordsRemover, countVectors, label_stringIdx]

pipeline = Pipeline(stages=transformers)
                           (when(col("comment").like("%my dog%"), 1) \
                           .when(col("comment").like("%I have a dog%"), 1) \
                           .when(col("comment").like("%my cat%"), 1) \
                           .when(col("comment").like("%I have a cat%"), 1) \
                           .when(col("comment").like("%my puppy%"), 1) \
                           .when(col("comment").like("%my pup%"), 1) \
                           .when(col("comment").like("%my kitty%"), 1) \
                           .when(col("comment").like("%my pussy%"), 1) \
                           .otherwise(0)))
df_clean.show()
# 1. Data preprocessing and build the classifier
from pyspark.ml.feature import RegexTokenizer, Word2Vec
from pyspark.ml.classification import LogisticRegression

# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="comment", outputCol="words", pattern="\\W")

word2Vec = Word2Vec(inputCol="words", outputCol="features")
from pyspark.ml import Pipeline

pipeline = Pipeline(stages=[regexTokenizer, word2Vec])

# Fit the pipeline to training documents.
pipelineFit = pipeline.fit(df_clean)
dataset = pipelineFit.transform(df_clean)
dataset.show()
# Remove the empty features caused by non-English statements.
from pyspark.sql.functions import col
from pyspark.sql.types import BooleanType
from pyspark.sql.functions import udf
from pyspark.ml.feature import RegexTokenizer
from pyspark.sql import Row

# In[58]:

# TODO - Change this directory to the right location where the data is stored
dataDir = "/Users/RajT/Downloads/20_newsgroups/*"
# Read the entire text into a DataFrame
textRDD = sc.wholeTextFiles(dataDir).map(lambda recs: Row(sentence=recs[1]))
textDF = spark.createDataFrame(textRDD)

# In[59]:

# Tokenize the sentences to words
regexTokenizer = RegexTokenizer(inputCol="sentence",
                                outputCol="words",
                                gaps=False,
                                pattern="\\w+")
tokenizedDF = regexTokenizer.transform(textDF)

# In[60]:

# Prepare the Estimator
# It sets the vector size, and the parameter minCount sets the minimum number of times a token must appear to be included in the word2vec model's vocabulary.
word2Vec = Word2Vec(vectorSize=3,
                    minCount=0,
                    inputCol="words",
                    outputCol="result")
# Train the model
model = word2Vec.fit(tokenizedDF)
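
# A hedged sketch (not from the original notebook) of inspecting the trained model:
# show a few learned vectors and find tokens similar to an illustrative word.
model.getVectors().show(5, truncate=False)
model.findSynonyms("computer", 5).show()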

# In[61]:
df_clean.show()

# COMMAND ----------

# MAGIC %md
# MAGIC #### 1. Data preprocessing and Build the classifier
# MAGIC To train a model against comments, we use RegexTokenizer to split each comment into a list of words and then use Word2Vec to convert the list to a word vector. Word2Vec maps each word to a unique fixed-size vector and then transforms each document into a vector by averaging the vectors of all the words in the document.

# COMMAND ----------

# data preprocessing
from pyspark.ml.feature import RegexTokenizer

regexTokenizer = RegexTokenizer(inputCol="comment",
                                outputCol="text",
                                pattern="\\W")
df_clean = regexTokenizer.transform(df_clean)
df_clean.show(10)

# COMMAND ----------

# MAGIC %md
# MAGIC ##### Alert: First try is to use 1,000,000 rows for testing

# COMMAND ----------

from pyspark.sql.functions import rand

df_clean.orderBy(rand(seed=0)).createOrReplaceTempView("table1")
df_clean = spark.sql("select * from table1 limit 1000000")

# Generate ensemble with Random Forest. Accuracy ~ 0.203
if __name__ == "__main__":
    spark = getSparkSession()
    passengers = readPassengersWithCastingToDoubles(spark).select(
        "survived", "pclass", "sibsp", "parch", "sex", "embarked", "age",
        "fare", "name")

    training, test = passengers.randomSplit([0.7, 0.3], seed=12345)
    training.cache()
    test.cache()

    regexTokenizer = RegexTokenizer(gaps=False,
                                    pattern="\\w+",
                                    inputCol="name",
                                    outputCol="name_parts",
                                    toLowercase=True)

    stopWords = ["mr", "mrs", "miss", "master", "jr", "j", "c", "d"]

    remover = StopWordsRemover(inputCol="name_parts",
                               outputCol="filtered_name_parts",
                               stopWords=stopWords)

    hashingTF = HashingTF(numFeatures=1000,
                          inputCol="filtered_name_parts",
                          outputCol="text_features")

    sexIndexer = StringIndexer(inputCol="sex",
                               outputCol="sexIndexed",
Example #25
    get_topic_0,
    StructType([
        StructField("cluster_id", IntegerType()),
        StructField("score", FloatType()),
        StructField("value", StringType())
    ]))

#newDF = df.withColumn("title", udf_get_title(df.marc)).withColumn("marc_subjects", udf_get_subjects(df.marc))
newDF = df.withColumn("allTextString", udf_flatten_text(df.allTextArray))

df = newDF

df.show(10, False)

tokenizer = RegexTokenizer(inputCol="allTextString",
                           outputCol="word_tokens",
                           pattern="\\W")
TokenizerData = tokenizer.transform(df)
df = TokenizerData

remover = StopWordsRemover(inputCol="word_tokens", outputCol="stop_removed")
my_sw = [
    'united', 'states', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k',
    'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z'
]
sw = remover.loadDefaultStopWords("english")
remover.setStopWords(sw + my_sw)
StopWordsRemoverData = remover.transform(df)
df = StopWordsRemoverData

cv = CountVectorizer(inputCol="stop_removed",
        return "../examples/smalldata/" + file_name

## This method loads the data, performs some basic filtering, and creates a Spark DataFrame
def load():
    row_rdd = spark.sparkContext.textFile(_locate("smsData.txt")).map(lambda x: x.split("\t", 1)).filter(lambda r: r[0].strip())
    return spark.createDataFrame(row_rdd, ["label", "text"])


##
## Define the pipeline stages
##

## Tokenize the messages
tokenizer = RegexTokenizer(inputCol="text",
                           outputCol="words",
                           minTokenLength=3,
                           gaps=False,
                           pattern="[a-zA-Z]+")

## Remove ignored words
stopWordsRemover = StopWordsRemover(inputCol=tokenizer.getOutputCol(),
                                    outputCol="filtered",
                                    stopWords=["the", "a", "", "in", "on", "at", "as", "not", "for"],
                                    caseSensitive=False)

## Hash the words
hashingTF = HashingTF(inputCol=stopWordsRemover.getOutputCol(),
                      outputCol="wordToIndex",
                      numFeatures=1 << 10)

## Create inverse document frequencies model
Example #27
sqlContext = SQLContext(sc)
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import HashingTF, IDF
from pyspark.sql import SparkSession
# Build a SparkSession; SparkSession provides a single point of entry to interact with underlying Spark functionality
spark = SparkSession\
    .builder\
    .appName("similairityExample")\
    .getOrCreate()

df = sqlContext.read.json('/home/sl4401/AA/wiki_**')
df = df.limit(1000)
regexTokenizer = RegexTokenizer(inputCol="text", outputCol="words", pattern="[^A-Za-z]+", toLowercase=True)
tokenized_data = regexTokenizer.transform(df)
stopWordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_data = stopWordsRemover.transform(tokenized_data)
hashingTF = HashingTF(inputCol="filtered_words", outputCol="raw_features", numFeatures=20)
featurizedData = hashingTF.transform(filtered_data)
idf= IDF(inputCol="raw_features", outputCol="features")
idfModel = idf.fit(featurizedData)
featurized_data = idfModel.transform(featurizedData)
from pyspark.ml.feature import Normalizer
normalizer = Normalizer(inputCol="features", outputCol="norm")
data = normalizer.transform(featurized_data)
import math
import pyspark.sql.functions as psf
from pyspark.sql.types import DoubleType
dot_udf = psf.udf(lambda x,y: float(x.dot(y)), DoubleType())
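
# A hedged sketch (not part of the original snippet) of how dot_udf might be applied: because the
# vectors were L2-normalized above, the dot product of two rows is their cosine similarity.
# It assumes the wiki JSON provides an `id` column for pairing distinct documents.
data.alias("i").join(data.alias("j"), psf.col("i.id") < psf.col("j.id")) \
    .select(
        psf.col("i.id").alias("i"),
        psf.col("j.id").alias("j"),
        dot_udf("i.norm", "j.norm").alias("cosine_similarity")) \
    .show(10)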
Example #28
        "C:/Users/yuy/Desktop/kaggle/movie/test.tsv",
        format='com.databricks.spark.csv',
        header="true",
        delimiter="\t",
        inferSchema="true",
        mode="DROPMALFORMED")

    train_raw = raw_data.select("PhraseId", "SentenceId", "Phrase")
    train_size = train_raw.count()
    test_size = test_data.count()
    #concatenate the two data sets for tokenization and count vectorization
    total_data = train_raw.union(test_data)

    #regex tokenization
    tokenizer = RegexTokenizer(inputCol="Phrase",
                               outputCol="words",
                               pattern="\\w+",
                               gaps=False)
    wordsDF = tokenizer.transform(total_data)
    #wordsDF.select("words").show(3, False)

    #don't remove stop words, because the test file contains stop words that must be predicted

    #count vectorized the word tokens
    cv = CountVectorizer(inputCol="words", outputCol="features")
    word_vec = cv.fit(wordsDF).transform(wordsDF).select(
        "features", "PhraseId")
    #word_vec.select("features").show(3)

    #IDF
    idf = IDF(inputCol="features", outputCol="IDF_features")
    idfModel = idf.fit(word_vec).transform(word_vec)
Example #29
        header = False,
        escape = "\"",
        schema = StructType([StructField("reviewId", IntegerType(), True),
                             StructField("asin", StringType(), True),
                             StructField("reviewText", StringType(), True)]))
    df = df.drop("asin")
    df = df.repartition(20)

    # # Use nltk.word_tokenizer to tokenize words
    # @udf(ArrayType(StringType()))
    # def tokenize(string):
    #     return word_tokenize(string)

    # df = df.withColumn("words", tokenize("reviewText"))

    df = RegexTokenizer(inputCol="reviewText", outputCol="words", pattern="\\W").transform(df)
    df = df.drop("reviewText")

    cv_model = CountVectorizer(inputCol="words", outputCol="tf").fit(df)
    vocabulary = cv_model.vocabulary

    df = cv_model.transform(df)
    df = df.drop("words")
    df.cache()

    df = IDF(inputCol="tf", outputCol="tfidf").fit(df).transform(df)
    df = df.drop("tf")
    df.unpersist()

    @udf(MapType(StringType(), FloatType()))
    def create_map(vector):
         ),
        (2,
         "\"You don't mind your honor?\" he asked Tushin. \"I've lost my company, your honor. I don't know where... such bad luck!'\""
         )
    ], "id int, message string")

    df.show(truncate=False)

    #A tokenizer that converts the input string to lowercase and then splits it by white spaces.
    words = Tokenizer(inputCol="message", outputCol="words").transform(df)
    words.show(truncate=False)

    # regex-based data cleaning and tokenization
    # A regex based tokenizer that extracts tokens either by using the provided regex pattern (in Java dialect) to split the text (default) or repeatedly matching the regex (if gaps is false). Optional parameters also allow filtering tokens using a minimal length. It returns an array of strings that can be empty.
    words = RegexTokenizer(inputCol="message",
                           outputCol="words",
                           pattern="\\W+").transform(df)
    words.show(truncate=False)

    # StopWordsRemover is feature transformer that filters out stop words from input.
    stop_words_removed = StopWordsRemover(
        inputCol="words", outputCol="stop_words_removed").transform(words)
    stop_words_removed.show(truncate=False)

    # group the tokens into n-grams (n words per group)
    # NGram is a feature transformer that converts the input array of strings into an array of n-grams. Null values in the input array are ignored. It returns an array of n-grams where each n-gram is represented by a space-separated string of words. When the input is empty, an empty array is returned. When the input array length is less than n (number of elements per n-gram), no n-grams are returned.
    ngram_df = NGram(n=2, inputCol="words",
                     outputCol="ngrams").transform(words)

    ngram_df.show(truncate=False)
    ngram_df.select("ngrams").show(truncate=False)
Example #31
    StructField("dateline", StringType(), True),
    StructField("headline", StringType(), True),
    StructField("metadata", StringType(), True),
    StructField("dc", StringType(), True),
    StructField("text", StringType(), False),
    StructField("title", StringType(), True),
])  #customized dataFrame

xmlDf = spark.read.format('com.databricks.spark.xml').options(
    rowTag='newsitem').load('hdfs://hadoop-dbse/user/pasumart/input_dataset/*',
                            schema=custom).limit(10000)

df_first = xmlDf.select(col('_itemid').alias('DocId'), 'text')

tokens = RegexTokenizer(minTokenLength=2,
                        inputCol='text',
                        outputCol='Words',
                        pattern="[^a-z]+")  #tokenizing the sentences to words

Tokenized = tokens.transform(df_first)

Tokens_filtered = StopWordsRemover(
    inputCol='Words',
    outputCol='filtered_words')  #filter stop words out of the tokens

Tokenized_filtered = Tokens_filtered.transform(Tokenized)

cv = CountVectorizer(
    inputCol="filtered_words", outputCol="features"
)  #constructing a matrix that maps each unique word to a unique ID

cv_model = cv.fit(Tokenized_filtered)
Example #32
# In[8]:

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer, OneHotEncoderEstimator, VectorAssembler
from pyspark.ml.feature import StopWordsRemover, Word2Vec, RegexTokenizer
from pyspark.sql.functions import udf, col, lower, regexp_replace

#Clean the text: replace every character outside [a-zA-Z\s] with an empty string
data_clean = data.select(
    'id', (lower(regexp_replace('tweet', "[^a-zA-Z\\s]", "")).alias('tweet')))

# In[9]:

#Now building a model pipeline
stage1 = RegexTokenizer(inputCol="tweet", outputCol="tokens", pattern="\\W")
stage2 = StopWordsRemover(inputCol="tokens", outputCol="filtered_words")
stage3 = Word2Vec(inputCol="filtered_words",
                  outputCol="vector",
                  vectorSize=1000)

# Can also use the below code for text preprocessing and cleaning

# In[ ]:

#from nltk.stem.snowball import SnowballStemmer
#stemmer = SnowballStemmer(language='english')

#Tokenizing the text
#tokenizer = Tokenizer(inputCol="tweet", outputCol="words_token")
#df_words_token = tokenizer.transform(data_clean).select('id', 'words_token')
Example #33
def content_userid(self, file1, file2, input_model, u_id, sim_bus_limit=3):

    from pyspark import SparkContext
    from pyspark.sql import SparkSession
    sparkconf_builder = spark_celery_app.sparkconf_builder
    spark_conf = sparkconf_builder()
    sc = SparkContext.getOrCreate(conf=spark_conf)
    spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()

    data = spark.read.parquet(file1)
    data.createOrReplaceTempView('review')
    df_business = spark.read.parquet(file2)
    schema = StructType([
        StructField("business_id", StringType(), True),
        StructField("score", IntegerType(), True),
        StructField("input_business_id", StringType(), True)
    ])

    similar_businesses_df = spark.createDataFrame([], schema)
    df = data.select('business_id', 'text')
    #df_review = df.groupby('business_id').agg(functions.collect_set('text')).show(100)
    review_rdd = df.rdd.map(tuple).reduceByKey(operator.add)
    review_df = spark.createDataFrame(review_rdd).withColumnRenamed(
        '_1', 'business_id').withColumnRenamed('_2', 'text')

    # create text preprocessing pipeline
    # Build the pipeline
    # tokenize review
    regexTokenizer = RegexTokenizer(gaps=False,
                                    pattern=r'\w+',
                                    inputCol='text',
                                    outputCol='text_token')
    #yelpTokenDF = regexTokenizer.transform(review_df)

    # filter stopwords
    stopWordsRemover = StopWordsRemover(inputCol='text_token',
                                        outputCol='nonstopwrd')
    #yelp_remove_df = stopWordsRemover.transform(yelpTokenDF)

    # TF
    countVectorizer = CountVectorizer(inputCol='nonstopwrd',
                                      outputCol='raw_features',
                                      minDF=2)
    #yelp_CountVec = cv.transform(yelp_remove_df)

    # IDF
    idf = IDF(inputCol="raw_features", outputCol="idf_vec")
    word2Vec = Word2Vec(vectorSize=500,
                        minCount=5,
                        inputCol='nonstopwrd',
                        outputCol='word_vec',
                        seed=123)
    #vectorAssembler = VectorAssembler(inputCols=['idf_vec', 'word_vec'], outputCol='comb_vec')
    pipeline = Pipeline(stages=[
        regexTokenizer, stopWordsRemover, countVectorizer, idf, word2Vec
    ])
    #pipeline_model = pipeline.fit(review_df)
    #pipeline_model.write().overwrite().save('content_userid')

    pipeline_model = PipelineModel.load(input_model)
    reviews_by_business_df = pipeline_model.transform(review_df)
    all_business_vecs = reviews_by_business_df.select(
        'business_id', 'word_vec').rdd.map(lambda x: (x[0], x[1])).collect()
    usr_rev_bus = spark.sql(
        'SELECT distinct business_id FROM review where stars >= 3.0 and user_id = "{}"'
        .format(u_id))

    bus_list = [i for i in usr_rev_bus.collect()]

    for b_id in bus_list:
        input_vec = [(r[1]) for r in all_business_vecs if r[0] == b_id[0]][0]
        similar_business_rdd = sc.parallelize(
            (i[0], float(CosineSim(input_vec, i[1])))
            for i in all_business_vecs)
        similar_business_df = spark.createDataFrame(
            similar_business_rdd).withColumnRenamed(
                '_1', 'business_id').withColumnRenamed('_2', 'score').orderBy(
                    "score", ascending=False)
        similar_business_df = similar_business_df.filter(
            col("business_id") != b_id[0]).limit(10)
        similar_business_df = similar_business_df.withColumn(
            'input_business_id', lit(b_id[0]))
        # accumulate restaurants similar to the ones this user has reviewed
        similar_businesses_df = similar_businesses_df.union(similar_business_df)
    result = similar_businesses_df
    #result.cache()
    # filter out those that have been reviewed before by the user
    d = [i[0] for i in usr_rev_bus.collect()]
    df_1 = result.filter(~(col('business_id').isin(d))).select(
        'business_id', 'score')
    #df_1= result.join(usr_rev_bus, 'business_id', 'left_outer').where(col("usr_rev_bus.business_id").isNull()).select([col('result.business_id'),col('result.score')])
    df_2 = df_1.orderBy("score", ascending=False).limit(sim_bus_limit)
    df_result = df_business.join(df_2, 'business_id',
                                 'right').select('business_id', 'score',
                                                 'name', 'categories',
                                                 'latitude', 'longitude')
    df_result.show()
    df_result = df_result.collect()
    return df_result
Example #34
# MAGIC %sql SELECT count(1), rating FROM reviews WHERE review LIKE '%great%' GROUP BY rating ORDER BY rating

# COMMAND ----------

# MAGIC %sql SELECT count(1), rating FROM reviews WHERE review LIKE '%poor%' GROUP BY rating ORDER BY rating

# COMMAND ----------

# MAGIC %md #NLP Pipeline

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer, StopWordsRemover, CountVectorizer

tokenizer = RegexTokenizer()    \
  .setInputCol("review")        \
  .setOutputCol("tokens")       \
  .setPattern("\\W+")

remover = StopWordsRemover()    \
  .setInputCol("tokens")        \
  .setOutputCol("stopWordFree")

counts = CountVectorizer()      \
  .setInputCol("stopWordFree")  \
  .setOutputCol("features")     \
  .setVocabSize(1000)
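
# COMMAND ----------

# A hedged sketch (not in the original notebook): chain the three stages above into a single
# pipeline and fit it on the `reviews` table queried earlier.
from pyspark.ml import Pipeline

nlpPipeline = Pipeline(stages=[tokenizer, remover, counts])
featurized = nlpPipeline.fit(spark.table("reviews")).transform(spark.table("reviews"))
featurized.select("features").show(5, truncate=False)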

# COMMAND ----------

from pyspark.ml.feature import Binarizer
Example #35
# <h3><b>6. Creating a Dataframe of input data and topic</b></h3>

# In[29]:


data = sqlContext.createDataFrame(dataList, ["index", "data","topic"])
data.show(5)


# <h3><b>7. Splitting data and converting into matrix of token counts</b></h3>

# In[35]:


# regular expression tokenizer
regexTokenizer = RegexTokenizer(inputCol="data", outputCol="words", pattern="\\W")
# stop words
add_stopwords = ["http","https","amp","rt","t","c","the"] 
stopwordsRemover = StopWordsRemover(inputCol="words", outputCol="filtered").setStopWords(add_stopwords)
# bag of words count
countVectors = CountVectorizer(inputCol="filtered", outputCol="features", vocabSize=10000, minDF=5)


# <b><h3>8. Creating pipeline of stages in the order of execution</h3></b>

# In[36]:


hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=10000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
pipeline = Pipeline(stages=[regexTokenizer, stopwordsRemover, hashingTF, idf, label_stringIdx])
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.shell import spark
sentenceDataFrame = spark.createDataFrame(
    [(0, "Hi I heard about Spark"), (1, "I wish Java could use case classes"),
     (2, "Logistic,regression,models,are,neat")], ["id", "sentence"])

tokenizer = Tokenizer(inputCol="sentence", outputCol="words")

regexTokenizer = RegexTokenizer(inputCol="sentence",
                                outputCol="words",
                                pattern="\\W")
# alternatively, pattern="\\w+", gaps(False)

countTokens = udf(lambda words: len(words), IntegerType())

tokenized = tokenizer.transform(sentenceDataFrame)
tokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)

regexTokenized = regexTokenizer.transform(sentenceDataFrame)
regexTokenized.select("sentence", "words") \
    .withColumn("tokens", countTokens(col("words"))).show(truncate=False)
Example #37
          .withColumnRenamed('lowerText', 'text'))

print(parsed.columns, '\n\n')
print(parsed.select('text').first())

# COMMAND ----------

# MAGIC %md
# MAGIC Next, let's convert our text into a list of words so that we can perform some analysis at the word level.  For this we will use a feature transformer called `RegexTokenizer` which splits up strings into tokens (words in our case) based on a split pattern.  We'll split our text on anything that matches one or more non-word characters.

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer

tokenizer = (RegexTokenizer()
             .setInputCol('text')
             .setOutputCol('words')
             .setPattern('\\W+'))
wordsDF = tokenizer.transform(parsed)

# COMMAND ----------

wordsDF.select('words').first()

# COMMAND ----------

# MAGIC %md
# MAGIC There are some very common words in our list of words which won't be that useful for our later analysis.  We'll create a UDF to remove them.
# MAGIC  
# MAGIC [StopWordsRemover](http://spark.apache.org/docs/latest/ml-features.html#stopwordsremover) is implemented for Scala but not yet for Python.  We'll use the same [list](http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words) of stop words it uses to build a user-defined function (UDF).

# COMMAND ----------
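
# A hedged sketch (not in the original notebook) of the stop-word-removal UDF described above.
# `stopWords` stands in for the linked stop-word list; only a tiny subset is shown here.
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

stopWords = {'a', 'an', 'and', 'the', 'of', 'to', 'in', 'is'}
removeStopWords = udf(lambda words: [w for w in words if w not in stopWords],
                      ArrayType(StringType()))
noStopWordsDF = wordsDF.withColumn('noStopWords', removeStopWords('words'))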
    def train_preproecessing(self, content):

        ## Load files
        X_files = sc.textFile('gs://chatrath/files/X_train.txt')
        X_asm_files = X_files.map(lambda x:
                                  (("gs://chatrath/data/asm/" + x + ".asm")))
        X_asm_files = X_asm_files.reduce(lambda x, y: x + "," + y)
        X_asm = sc.wholeTextFiles(X_asm_files)
        X_train_asm = X_asm.mapValues(lambda x: re.sub("""[\t{Z}]""", "", x))
        X_train_asm = X_train_asm.mapValues(
            lambda x: re.sub("""[+{Z}]+""", "", x))
        X_train_asm = X_train_asm.mapValues(
            lambda x: re.sub("""[-{Z}]+""", "", x))
        X_train_asm = X_train_asm.mapValues(
            lambda x: re.sub("""[={Z}]+""", "", x))
        X_train_asm = X_train_asm.mapValues(
            lambda x: re.sub("""[\r|{Z}]+""", "", x))
        X_train_asm = X_train_asm.mapValues(
            lambda x: re.sub("""[;{Z}]+""", "", x))
        X_train_asm = X_train_asm.mapValues(
            lambda x: re.sub("""[\n{Z}]+""", "", x))
        X_train_asm = X_train_asm.mapValues(lambda x: x.split())

        ## Filter and Extract opcode
        X_train_asm = X_train_asm.mapValues(
            lambda x: list(filter(lambda y: y in content, x)))
        X_train_asm = X_train_asm.mapValues(lambda x: " ".join(map(str, x)))
        X_train_asm = X_train_asm.map(
            lambda x: (x[0].split("/")[-1].split(".")[0], x[1]))

        ## Create Dataframe
        asm_df = X_train_asm.map(
            lambda x: Row(filename=x[0], data=x[1])).toDF()

        ## Load labels
        rddy = sc.textFile("gs://chatrath/files/y_train.txt")

        ## Ensure labels are in order
        X_files = X_files.zipWithIndex()
        rddy = rddy.zipWithIndex()
        dfy = rddy.map(
            lambda line: Row(label=line[0], id1=line[1]))  #(id,label)
        dfy = dfy.toDF()
        dfx = X_files.map(
            lambda line: Row(file=line[0], id=line[1])).toDF()  #(id,filename)
        resultantdf = dfx.alias('a').join(
            dfy.alias('b'),
            col('b.id1') == col('a.id')).drop('id').drop(
                'id1')  #(filename,label)
        resultantdf = resultantdf.join(
            asm_df, asm_df.filename == resultantdf.file).drop('file').drop(
                'filename')  #(data,label)

        ## Tokenizing data
        regexToken = RegexTokenizer(inputCol="data",
                                    outputCol="words",
                                    pattern="\\W")
        asm_df = regexToken.transform(resultantdf)
        asm_df = asm_df.drop('data')

        ## Using CountVectorizer to extract features
        countvector = CountVectorizer(inputCol="words", outputCol="features")
        cv = countvector.fit(asm_df)
        asm_df = cv.transform(asm_df)
        traindata = asm_df.withColumn('label',
                                      resultantdf['label'].cast('int'))

        return traindata, cv
Example #39
from pyspark.sql.functions import col

parsed = dfSmall.filter((col("title") != "<PARSE ERROR>") & col("redirect_title").isNull() & col("text").isNotNull())
parsed.take(1)

# COMMAND ----------

# MAGIC %md
# MAGIC Use a regular expression to tokenize (split into words).  Pattern defaults to matching the separator, but can be set to match tokens instead.

# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer

tokenizer = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\W+")
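
# A hedged illustration of the alternative mentioned above: match the tokens themselves rather
# than the separators by using a word pattern with gaps disabled.
tokenMatcher = RegexTokenizer().setInputCol("text").setOutputCol("words").setPattern("\\w+").setGaps(False)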

# COMMAND ----------

# MAGIC %md
# MAGIC Create a `HashingTF` transformer to hash words to buckets with counts, then use an `IDF` estimator to compute inverse-document frequency for buckets based on how frequently words have hashed to those buckets in the given documents.  Next, normalize the tf-idf values so that the \\( l^2 \\) norm is one for each row.

# COMMAND ----------

from pyspark.ml.feature import IDF, HashingTF, Normalizer

hashingTF = HashingTF().setNumFeatures(10000).setInputCol(tokenizer.getOutputCol()).setOutputCol("hashingTF")

idf = IDF().setMinDocFreq(10).setInputCol(hashingTF.getOutputCol()).setOutputCol("idf")

normalizer = Normalizer().setInputCol(idf.getOutputCol()).setOutputCol("features")
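
# COMMAND ----------

# A hedged sketch (not in the original notebook): chain the stages above into a pipeline and
# apply it to the parsed Wikipedia DataFrame to produce normalized tf-idf features.
from pyspark.ml import Pipeline

tfidfPipeline = Pipeline(stages=[tokenizer, hashingTF, idf, normalizer])
featuresDF = tfidfPipeline.fit(parsed).transform(parsed)
featuresDF.select("features").first()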