Python StringIndexer.fit Examples, pyspark.ml.feature.StringIndexer.fit Python Examples

Example #1

0

Show file

File: extraction2.py Project: ashishsjsu/Spark101

def mapClickCategoricalFeatures():
    """Index the categorical click columns and save the result as CSV.

    Loads the clicks data with ``getDataFrame(CLICKS_HDPFILEPATH)``, adds a
    ``<col>Index`` column for each of C2, C3, C4, C5, C7 and C8 using a
    ``StringIndexer``, shows the result, and writes the selected columns
    with the ``com.databricks.spark.csv`` format under ``HADOOPDIR``.

    The single-argument prints are parenthesised so the function parses
    under both Python 2 and Python 3 with identical output.
    """
    df = getDataFrame(CLICKS_HDPFILEPATH)
    df.persist(StorageLevel.DISK_ONLY)

    print(df.columns)

    # Columns to be mapped to numeric indices.
    click_cols = ["C2", "C3", "C4", "C5", "C7", "C8"]

    # Start from the source frame; every pass appends one "<col>Index"
    # column (previously a "" sentinel was compared against the DataFrame
    # on each iteration to achieve the same thing).
    indexed = df
    for column in click_cols:
        print(indexed)
        outcol = column + "Index"
        indexer = StringIndexer(inputCol=column, outputCol=outcol)
        indexed = indexer.fit(indexed).transform(indexed)

    indexed.show()

    indexed.persist(StorageLevel.DISK_ONLY)

    indexed.select('C0', 'C1', 'C2Index', 'C3Index', 'C4Index', 'C5Index', 'C6', 'C7Index', 'C8Index').write.format('com.databricks.spark.csv').save(HADOOPDIR+"data/click_fraud/extraction/clicks_23feb12.csv")

Example #2

0

Show file

File: classify.py Project: ApplyHiTech/DataScienceHW1

def train_random_forest(df, seed=None):
    """Index the ``label`` column of ``df`` and fit a small random forest.

    Args:
        df: DataFrame with a ``label`` column; the classifier is created
            without an explicit features column.
        seed: Optional integer seed for the classifier. When omitted a
            random seed is drawn, which is what the original
            ``int(random.random())`` was evidently meant to do; that
            expression always truncated to 0.

    Returns:
        A ``(classifier, fitted_model)`` tuple. The model is fitted on
        ``df`` with the extra ``indexed`` label column.
    """
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    if seed is None:
        # random.random() lies in [0, 1), so int() of it is always 0.
        seed = random.randrange(2 ** 31)
    rf = RandomForestClassifier(numTrees=3, maxDepth=2, labelCol="indexed",
                                seed=seed)
    return rf, rf.fit(td)

Example #3

0

Show file

File: spark.py Project: PranavGoel/Apache_Spark-MlLiB-Titanic-Kaggle-Competition

def build_decisionTree(path):

    df = load_data(path)
    avg_age=find_avg_age(df)
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show(truncate=False)

    dt = DecisionTreeClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(dt.maxDepth, [1,2,3,5,6,8,10]).build()

    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = cv.fit(df)

    prediction = cvModel.transform(df)
    prediction.show(truncate=False)

    print "classification evaluation :" , evaluator.evaluate(prediction)

    return cvModel,avg_age

Example #4

0

Show file

File: spark.py Project: PranavGoel/Apache_Spark-MlLiB-Titanic-Kaggle-Competition

def build_randomForest(path):
    df = load_data(path)
    avg_age=find_avg_age(df)
    df = data_preparation(df, avg_age)

    df = df.drop('Cabin')
    df = df.drop('Ticket')
    df = df.drop('Name')

    stringIndexer = StringIndexer(inputCol="Survived", outputCol="indexed")
    si_model = stringIndexer.fit(df)
    df = si_model.transform(df)
    df.show()

    rdf = RandomForestClassifier(labelCol='indexed')
    grid = ParamGridBuilder().addGrid(rdf.maxDepth, [1,2,3,5,6,8,10])\
                            .addGrid(rdf.numTrees,[1,5,10,30,50,100,200]).build()

    evaluator = BinaryClassificationEvaluator()
    cv = CrossValidator(estimator=rdf, estimatorParamMaps=grid, evaluator=evaluator)
    cvModel = rdf.fit(df)

    prediction = cvModel.transform(df)
    prediction.show()

    print "classification evaluation :" , evaluator.evaluate(prediction)

    return cvModel,avg_age

Example #5

0

Show file

File: sc_classification.py Project: yokeyong/atap

def main(sc, spark):
    """Train a multinomial logistic regression on the corpus and report test error.

    The vectorizer and the label indexer are both fitted on the full
    corpus before the 80/20 split; the fitted stages are then combined
    with the classifier in a pipeline that is fitted on the training part.
    """
    corpus = load_corpus(sc, spark)

    # Fitted feature and label stages (both see the whole corpus).
    vectorizer_model = make_vectorizer().fit(corpus)
    label_indexer_model = StringIndexer(
        inputCol="label", outputCol="indexedLabel").fit(corpus)

    training, test = corpus.randomSplit([0.8, 0.2])

    classifier = LogisticRegression(
        maxIter=10, regParam=0.3, elasticNetParam=0.8,
        family="multinomial", labelCol="indexedLabel", featuresCol="tfidf")

    pipeline = Pipeline(
        stages=[vectorizer_model, label_indexer_model, classifier])
    model = pipeline.fit(training)

    # Score the held-out split and show a few rows.
    predictions = model.transform(test)
    predictions.select("prediction", "indexedLabel", "tfidf").show(5)

    # Accuracy on (prediction, true label); the error is its complement.
    evaluator = MulticlassClassificationEvaluator(
        labelCol="indexedLabel", predictionCol="prediction", metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Test Error = %g" % (1.0 - accuracy))

    # Third pipeline stage is the fitted classifier.
    fitted_classifier = model.stages[2]
    print(fitted_classifier)  # summary only

Example #6

0

Show file

File: extraction2.py Project: ashishsjsu/Spark101

def mapPublisherCategoricalFeatures():
    """Index the categorical publisher columns and save the result as CSV.

    Loads the publishers data with ``getDataFrame(PUBLISHERS_HDPFILEPATH)``,
    adds a ``<col>Index`` column for each of C0, C1, C2 and C3 using a
    ``StringIndexer``, shows the result, and writes the four index columns
    with the ``com.databricks.spark.csv`` format under ``HADOOPDIR``.

    The single-argument prints are parenthesised so the function parses
    under both Python 2 and Python 3 with identical output.
    """
    df = getDataFrame(PUBLISHERS_HDPFILEPATH)
    df.persist(StorageLevel.DISK_ONLY)

    print(df.columns)

    publisher_cols = ["C0", "C1", "C2", "C3"]

    # Start from the source frame; every pass appends one "<col>Index"
    # column (previously a "" sentinel was compared against the DataFrame
    # on each iteration to achieve the same thing).
    indexed = df
    for column in publisher_cols:
        print(indexed)
        outcol = column + "Index"
        # StringIndexer maps each value of the input column to a
        # double-valued index and stores it in a new column.
        indexer = StringIndexer(inputCol=column, outputCol=outcol)
        # Fit on the current frame, then transform it.
        indexed = indexer.fit(indexed).transform(indexed)

    indexed.show()

    indexed.persist(StorageLevel.DISK_ONLY)

    indexed.select('C0Index', 'C1Index', 'C2Index', "C3Index").write.format('com.databricks.spark.csv').save(HADOOPDIR+"data/click_fraud/extraction/publishers_23feb12.csv")

Example #7

0

Show file

File: credit_prediction.py Project: WeihuaLei/LearnSpark

def testClassification(data):
    """Train a random forest on ``data`` and print predictions and AUC.

    The ``label`` column is indexed into ``indexLabel``, the data is split
    80/20 with seed 13, and a ``RandomForestClassifier`` is fitted on the
    training part. The function prints every prediction row of the test
    part, every ``(probability of index 1, indexLabel)`` pair, and finally
    the area under the ROC curve.

    The single-argument prints are parenthesised so the function parses
    under both Python 2 and Python 3 with identical output.
    """
    # Index the string labels.
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexLabel")
    si_model = stringIndexer.fit(data)
    td = si_model.transform(data)

    rf = RandomForestClassifier(numTrees=5, maxDepth=4, labelCol="indexLabel",seed=13)

    trainData,testData = td.randomSplit([0.8,0.2],13)

    predictionDF = rf.fit(trainData).transform(testData)

    selected = predictionDF\
        .select('label','indexLabel','prediction','rawPrediction','probability')
    for row in selected.collect():
        print(row)

    # Go through .rdd: DataFrame.map is not available on Spark 2.x, while
    # DataFrame.rdd.map works on both 1.x and 2.x.
    scoresAndLabels = predictionDF.rdd\
       .map(lambda x: (float(x.probability.toArray()[1]), x.indexLabel))
    for sl in scoresAndLabels.collect():
        print(sl)
    evaluator = BinaryClassificationEvaluator(labelCol='indexLabel',metricName='areaUnderROC')
    metric = evaluator.evaluate(selected)
    print(metric)

Example #8

0

Show file

File: ml.py Project: ribonj/lsir

def label(df, column):
    """
    Return ``df`` with an added ``<column>_label`` column that holds the
    StringIndexer index of ``column``.
    """
    output_name = column + '_label'
    fitted = StringIndexer(inputCol=column, outputCol=output_name).fit(df)
    return fitted.transform(df)

Example #9

0

Show file

File: Session6.py Project: raul-arrabales/Spark-Hands-on

def indexStringColumns(df, cols):
    """Replace every column named in ``cols`` by its StringIndexer index.

    Each column is indexed into a temporary ``<name>-x`` column, the
    original column is dropped, and the temporary column is renamed back
    to the original name.
    """
    result = df
    for name in cols:
        temp_name = name + "-x"
        fitted = StringIndexer(inputCol=name, outputCol=temp_name).fit(result)
        without_original = fitted.transform(result).drop(name)
        result = without_original.withColumnRenamed(temp_name, name)
    return result

Example #10

0

Show file

File: GdeltDecisionTree.py Project: liber-pater/ProjectThales

def events(df,column_name):
    """Index ``column_name`` and one-hot encode the resulting index.

    Adds ``<column_name>I`` (StringIndexer output) and ``<column_name>V``
    (OneHotEncoder output) to ``df`` and returns the encoded DataFrame.
    """
    index_col, vector_col = column_name+"I", column_name+"V"
    index_model = StringIndexer(inputCol=column_name, outputCol=index_col).fit(df)
    with_index = index_model.transform(df)
    return OneHotEncoder(inputCol=index_col, outputCol=vector_col).transform(with_index)

Example #11

0

Show file

File: ch08-listings.py Project: AkiraKane/first-edition

def indexStringColumns(df, cols):
    """Return ``df`` with each column in ``cols`` replaced by its numeric index.

    For every column a StringIndexer writes a temporary ``<name>-num``
    column; the original is dropped and the temporary column takes over
    its name.
    """
    from pyspark.ml.feature import StringIndexer
    current = df
    for column in cols:
        indexed_name = column + "-num"
        model = StringIndexer(inputCol=column, outputCol=indexed_name).fit(current)
        current = (model.transform(current)
                   .drop(column)
                   .withColumnRenamed(indexed_name, column))
    return current

Example #12

0

Show file

File: anomaly_detection.py Project: gitofsid/MyBigDataCode

    def oneHotEncoding(self, df, input_col):
        """One-hot encode ``input_col`` and return ``id`` plus the encoded column.

        ``input_col`` is indexed into ``indexed`` and encoded into
        ``features`` (with ``dropLast=False``). A UDF then converts each
        encoded vector with ``Vectors.dense(...).tolist()`` and the result
        is exposed as ``num_<input_col>``. Both the intermediate and the
        returned DataFrame are cached.
        """
        index_model = StringIndexer(inputCol=input_col, outputCol="indexed").fit(df)
        indexed_df = index_model.transform(df)

        one_hot = OneHotEncoder(inputCol="indexed", outputCol="features", dropLast=False)
        encoded = one_hot.transform(indexed_df).select(df.id, 'features').cache()

        # No returnType is passed to udf(), exactly as in the original.
        to_list = udf(lambda line: Vectors.dense(line).tolist())
        output_name = "num_"+input_col
        result = encoded.select(df.id, to_list(encoded.features).alias(output_name)).cache()

        return result

Example #13

0

Show file

File: test_feature.py Project: Brett-A/spark

    def test_string_indexer_handle_invalid(self):
        """Check StringIndexer output for a null label under "keep" and "skip"."""
        rows = [(0, "a"), (1, "d"), (2, None)]
        df = self.spark.createDataFrame(rows, ["id", "label"])

        # handleInvalid="keep": the row with the null label is expected to
        # stay and receive index 2.0, after "a" (0.0) and "d" (1.0).
        keep_indexer = StringIndexer(inputCol="label", outputCol="indexed",
                                     handleInvalid="keep",
                                     stringOrderType="alphabetAsc")
        kept_rows = keep_indexer.fit(df).transform(df).select("id", "indexed").collect()
        self.assertEqual(
            kept_rows,
            [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0), Row(id=2, indexed=2.0)])

        # handleInvalid="skip": the row with the null label is expected to
        # be absent from the output.
        skip_indexer = keep_indexer.setHandleInvalid("skip")
        skipped_rows = skip_indexer.fit(df).transform(df).select("id", "indexed").collect()
        self.assertEqual(
            skipped_rows,
            [Row(id=0, indexed=0.0), Row(id=1, indexed=1.0)])

Example #14

0

Show file

Instructions:
Import the appropriate class and create an indexer object to transform the carrier column from a string to a numeric index.
Prepare the indexer object on the flight data.
Use the prepared indexer to create the numeric index column.
Repeat the process for the org column.

"""

from pyspark.ml.feature import StringIndexer

# Create an indexer that maps the string column 'carrier' to the numeric
# column 'carrier_idx'.
indexer = StringIndexer(inputCol='carrier', outputCol='carrier_idx')

# Fitting lets the indexer identify the categories present in the data.
indexer_model = indexer.fit(flights)

# Transforming adds the new column with the numeric index values.
flights_indexed = indexer_model.transform(flights)

# Repeat the same create / fit / transform steps for 'org', chained in one
# expression and applied on top of the already-indexed frame.
flights_indexed = StringIndexer(
    inputCol='org',
    outputCol='org_idx').fit(flights_indexed).transform(flights_indexed)
"""
Assembling columns
The final stage of data preparation is to consolidate all of the predictor columns into a single column.

At present our data has the following predictor columns:

mon, dom and dow

Example #15

0

Show file

irisNew_DF.show(5)

# In[19]:

# Assign a numeric labelIndex to every flower class with a StringIndexer.

# StringIndexer encodes a string column of labels as a column of label indices.

classlabel_indexer = StringIndexer(inputCol="label", outputCol="labelIndex")

# In[20]:

# Fit the indexer on the DataFrame and apply it, adding the labelIndex column.

irisIndexer_DF = classlabel_indexer.fit(irisNew_DF).transform(irisNew_DF)

# In[21]:

# Display the first 5 records after the StringIndexer transformation.

irisIndexer_DF.show(5)

# In[22]:

#defining logistic regression classifier model

logReg_model = LogisticRegression(labelCol="labelIndex",
                                  featuresCol="features",
                                  maxIter=100,
                                  regParam=0.001,

Example #16

0

Show file

]
nazwy = ["Airline1_Back", 'Airline2_There', 'Airline2_Back', 'Airline1_There']

for country_from in country_list:
    for country_to in country_list:
        print("Country from: ", country_from, " Country to: ", country_to)
        try:
            df2 = df.filter(df.Country_from == country_from).filter(
                df.Country_to == country_to)
            df_temp = df2.select(df2.Scrap_time.cast("float"),'Airline1_Back','Airline2_There','Airline2_Back'\
                             ,'Airline1_There',df2.Days.cast("float"),df2.Journey_time.cast("float"), df2.Full_Price.cast("float"))

            for nazwa in nazwy:
                indexer = StringIndexer(inputCol=nazwa,
                                        outputCol=nazwa + "Index")
                df_temp = indexer.fit(df_temp).transform(df_temp)

            df_temp = df_temp.select('Airline1_BackIndex','Airline2_ThereIndex','Airline2_BackIndex','Airline1_ThereIndex','Scrap_time',\
                   'Days','Journey_time', 'Full_Price')
            transformed = transData(df_temp)

            test = transformed.rdd.map(lambda row: LabeledPoint(
                row['label'], row['features'].toArray()))
            model = RandomForest.trainRegressor(test,
                                                categoricalFeaturesInfo={},
                                                numTrees=30,
                                                featureSubsetStrategy="auto",
                                                impurity='variance',
                                                maxDepth=4,
                                                maxBins=32)

Example #17

0

Show file

File: Handling Categorical Data Tutorial.py Project: lcs-grtsch/Machine-Learning

# # Register data
# spark_flights.createOrReplaceTempView("flights_temp")
# # Data should appear
# print(spark.catalog.listTables())


################
# Spark StringIndexer()
################

# Only load the carrier column
carrier_df = spark_flights.select("carrier")
# carrier_df.show(5)
# Spark's way of replacing string values with numerical index values:
# set up the StringIndexer() ...
carr_indexer = StringIndexer(inputCol="carrier", outputCol="carrier_index")
# ... then fit it on the data and transform the same data
carr_indexed = carr_indexer.fit(carrier_df).transform(carrier_df)
# carr_indexed.show(7)

# One-hot encoding: run the StringIndexer first, then feed its index column
# into the OneHotEncoder.
carrier_df_onehot = spark_flights.select("carrier")

stringIndexer = StringIndexer(inputCol="carrier", outputCol="carrier_index")
model = stringIndexer.fit(carrier_df_onehot)
indexed = model.transform(carrier_df_onehot)
# dropLast=False is passed explicitly; the encoder is applied with
# transform() directly, without a fit step.
encoder = OneHotEncoder(dropLast=False, inputCol="carrier_index", outputCol="carrier_vec")
encoded = encoder.transform(indexed)

encoded.show(7)

Example #18

0

Show file

from pyspark.sql.types import StructField, StructType, LongType, StringType, IntegerType
import pyspark.sql.functions as F
from pyspark import SparkContext, SparkConf
from tqdm import tqdm
from itertools import permutations
from collections import defaultdict
import time
from pyspark.ml.feature import StringIndexer

# Local Spark session using 8 worker threads.
spark = SparkSession.builder.appName("tag base on spark").master("local[8]").getOrCreate()

sc = spark.sparkContext

# NOTE(review): this schema is built but not passed to read.csv below, and
# its fields (rating, timestamp) do not include the 'tag' column used
# later - confirm whether it is needed.
schema = StructType([StructField('userId', IntegerType(), True), 
            StructField('movieId', IntegerType(), True), 
            StructField('rating', LongType(), True), 
            StructField('timestamp', IntegerType(), True)])
# Read the tags file, taking column names from the header row.
tags = spark.read.csv(r'D:\Users\hao.guo\deepctr\recsys\movielen\ml-20m\tags.csv', header=True)

# Map each distinct tag string to an index in 'tagid', then cast it to int.
index = StringIndexer(inputCol='tag', outputCol='tagid')
model = index.fit(tags)
tags = model.transform(tags)
tags = tags.withColumn('tagid', tags['tagid'].cast('int'))
# Continue on the RDD API with (userId, movieId, tagid) rows.
tags_rdd = tags.select(['userId', 'movieId', 'tagid']).rdd

# 70/30 train/test split with a fixed seed; both parts are cached.
train_rdd, test_rdd = tags_rdd.randomSplit([0.7, 0.3], seed=2020)
train_rdd = train_rdd.cache()
test_rdd = test_rdd.cache()

train_rdd = train_rdd.map(lambda s: (s, 1)).

Example #19

0

Show file

File: user_item_affinity_calculator.py Project: dpatchigolla/ml_learning

    def build_recommendation_model(self):
        """Index users and items, tune ALS on a validation split, then fit and score it.

        Steps, as performed below:

        1. Map ``user_id`` and ``item_id`` of ``self.df`` to the integer
           columns ``user_id_no`` / ``item_id_no`` with StringIndexer and
           join them back, leaving ``self.df`` with ``item_id_no``,
           ``user_id_no`` and ``rating``.
        2. Split the data 60/20/20 into training, validation and test sets.
        3. Fit ALS for every (rank, regParam) pair and keep the pair with
           the lowest Mean Percentile Ranking on the validation set.
        4. Refit on the training set with the best pair, store the model in
           ``self.model`` and report RMSE and MPR on the test set.

        Progress messages go to both ``logging.info`` and
        ``print_with_time``.
        """

        def announce(message):
            # Same text to the log and to the timed console output.
            logging.info(message)
            print_with_time(message)

        def make_als(rank, reg):
            # The only settings that vary between fits are rank and regParam.
            return ALS(rank=rank,
                       regParam=reg,
                       nonnegative=True,
                       implicitPrefs=True,
                       userCol="user_id_no",
                       itemCol="item_id_no",
                       checkpointInterval=-1,
                       coldStartStrategy="drop",
                       ratingCol="rating")

        announce("getting distinct users")
        users = self.df.select(["user_id"]).distinct()

        announce("getting distinct items")
        items = self.df.select(["item_id"]).distinct()

        announce("mapping user_id to number")
        user_indexer = StringIndexer(inputCol="user_id",
                                     outputCol="user_id_no")
        user_lookup = user_indexer.fit(users).transform(users)
        self.user_indexed = user_lookup.select(
            user_lookup.user_id.cast("string"),
            user_lookup.user_id_no.cast("int"))

        announce("mapping item_id to number")
        item_indexer = StringIndexer(inputCol="item_id",
                                     outputCol="item_id_no")
        item_lookup = item_indexer.fit(items).transform(items)
        self.item_indexed = item_lookup.select(
            item_lookup.item_id.cast("string"),
            item_lookup.item_id_no.cast("int"))

        announce("joining df with user_indexed rdd")
        self.df = self.df.join(self.user_indexed, ["user_id"], 'inner')

        announce("joining df with item_indexed rdd")
        with_items = self.df.join(self.item_indexed, ["item_id"], 'inner')
        self.df = with_items.select(["item_id_no", "user_id_no", "rating"])

        ############

        announce("splitting dataset into training and testing")
        (training, validation, test) = self.df.randomSplit([0.6, 0.2, 0.2])

        ######

        # Every (rank, regParam) combination, rank varying slowest.
        all_params = [(rank, reg)
                      for rank in [25, 50, 100]
                      for reg in [0.1, 0.01, 0.001]]

        min_mpr = float('inf')
        best_rank = -1
        best_reg = -1
        for iteration_no, (rank, reg) in enumerate(all_params):

            logging.info(iteration_no)
            print_with_time(str(iteration_no))
            announce("rank=%s, reg=%s " % (rank, reg))

            self.model = make_als(rank, reg).fit(training)

            announce("transforming the validation set")
            predictions = self.model.transform(validation)

            announce("getting rmse on validation set")
            evaluator = RegressionEvaluator(metricName="rmse",
                                            labelCol="rating",
                                            predictionCol="prediction")
            rmse = evaluator.evaluate(predictions)
            announce("Root-mean-square error = " + str(rmse))

            announce("getting MPR on validation set")
            ev = RankBasedEvaluator2("user_id_no", "rating", "prediction")
            mpr = ev.evaluate(sqlContext, predictions)
            announce("Mean Percentile Ranking = " + str(mpr))

            # Lower MPR wins; ties keep the earlier combination.
            if mpr < min_mpr:
                min_mpr, best_rank, best_reg = mpr, rank, reg

        announce('The best model was trained with rank %s and reg %s' %
                 (best_rank, best_reg))

        ######

        announce("starting model training")
        self.model = make_als(best_rank, best_reg).fit(training)

        announce("transforming the test set")
        predictions = self.model.transform(test)

        announce("getting rmse on test set")
        evaluator = RegressionEvaluator(metricName="rmse",
                                        labelCol="rating",
                                        predictionCol="prediction")
        rmse = evaluator.evaluate(predictions)
        announce("Root-mean-square error = " + str(rmse))

        announce("getting MPR on test set")
        ev = RankBasedEvaluator2("user_id_no", "rating", "prediction")
        mpr = ev.evaluate(sqlContext, predictions)
        announce("Mean Percentile Ranking = " + str(mpr))

Example #20

0

Show file

File: Covid-19.py Project: evanditter/spark-covid19-research

# will be creating a new ML dataframe from the combined dataframe above
from pyspark.ml.feature import StringIndexer

# Select the distinct rows of the columns of interest from the "combined"
# view, deriving per-capita case and fatality densities, ordered by state
# and date.
interested_cols_ML = spark.sql(
    """SELECT DISTINCT state, date, restriction_end_date_of_april28, religious_restrictions, current_restrictions, m50, m50_index, confirmed_cases AS cases, fatalities AS fatalities, (confirmed_cases / current_population) AS cases_density, (fatalities / current_population) AS fatality_density FROM combined ORDER BY state, date"""
)
# Register the result so it can be queried by name, and preview 3 rows.
interested_cols_ML.createOrReplaceTempView("interested_cols_ML")
interested_cols_ML.show(3)

# COMMAND ----------

# Turn the string categorical variables into numeric index columns.
# First index religious_restrictions ...
lblIndxr = StringIndexer().setInputCol("religious_restrictions").setOutputCol(
    "label_religious_rest")
idxRes = lblIndxr.fit(interested_cols_ML).transform(interested_cols_ML)
# ... then index current_restrictions on top of that result, so idxRes2
# carries both index columns.
lblIndxr2 = StringIndexer().setInputCol("current_restrictions").setOutputCol(
    "label_curr_rest")
idxRes2 = lblIndxr2.fit(idxRes).transform(idxRes)
# tried using for loop, ended up taking just as long
# indexers = [StringIndexer(inputCol=column, outputCol=column+"_label").fit(interested_cols_ML).transform(interested_cols_ML) for column in interested_cols_ML.columns if "_restrictions" in column ]

# COMMAND ----------

# Drop fatality_density and the two original string columns that were
# indexed above.
cols_drop = [
    'fatality_density', 'religious_restrictions', 'current_restrictions'
]
final_cols_df = idxRes2.drop(*cols_drop)
final_cols_df.show(3)

# COMMAND ----------

Example #21

0

Show file

File: reglog_nocv_simple.py Project: pifouuu/ProjetBigData


from pyspark.mllib.linalg import VectorUDT
from pyspark.sql.types import StructType, StructField,DoubleType

# Two-column schema for the vectorized rows: a double label and a vector.
schema = StructType([StructField('label',DoubleType(),True),StructField('Vectors',VectorUDT(),True)])


# Vectorize every tokenized training row with the broadcast dictionary and
# build a DataFrame with the schema above.
features=dfTrainTok.map(partial(vectorize,dico=dict_broad.value)).toDF(schema)

print "Features created"

from pyspark.ml.feature import StringIndexer

# Index the label column into 'target_indexed'; the fitted model is kept so
# the same mapping can be applied to the test set below.
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(features)
featIndexed = string_indexer_model.transform(features)

print "labels indexed"

# The classifier reads the indexer's output column as its label.
lr = LogisticRegression(featuresCol='Vectors', labelCol=string_indexer.getOutputCol())

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

lr_model = lr.fit(featIndexed)

# Prepare the test set the same way: tokenize, vectorize, then index the
# labels with the model fitted on the training features.
dfTestTok = tokenizer.transform(dfTest)
featuresTest=dfTestTok.map(partial(vectorize,dico=dict_broad.value)).toDF(schema)
testIndexed = string_indexer_model.transform(featuresTest)

Example #22

0

Show file

# The features column will now contain the values from all the input columns identified
assemble_apply=Vec_assembler.transform(a)

# assemble_apply.show(10)

# The individual measurements are now part of the features column, so they
# are no longer needed and can be removed from the DataFrame.
assembly_final=assemble_apply.drop('sepal_length','sepal_width','petal_length','petal_width')

# assembly_final.show(5)

# Add a label index for the target class, which in this case is species.
# The label index is 0, 1 or 2 depending on the frequency of occurrence, with 0 assigned to the most frequent species.
label=StringIndexer(inputCol='species',outputCol='label')

si_dataset_fit=label.fit(assembly_final).transform(assembly_final)
# si_dataset_fit.show(5)

# Divide the data set into two parts, training and test, to see how accurate the model is.
# An 80-20 split is used here: 80 for training and 20 for testing.
train_data,test_data=si_dataset_fit.randomSplit([0.8,0.2])

# Using logistic regression

rg=0.03 # Can be changed depending on which value yields more accurate results.

lr =LogisticRegression(featuresCol='features',labelCol='label',regParam=rg)
model = lr.fit(train_data) # Fit the logistic regression model on the training data

# See how the model performs by applying it to the test data
prediction=model.transform(test_data)

Example #23

0

Show file

df.count()

# ### a. Prepare in Input Features
#

# First, you will need to prepare each of the input features. While age is a numeric feature, state and name are not. These need to be converted into numeric vectors before you can train the model. Use a StringIndexer along with the OneHotEncoderEstimator to convert the name, state, and sex columns into numeric vectors. Use the VectorAssembler to combine the name, state, and age vectors into a single features vector. Your final dataset should contain a column called features containing the prepared vector and a column called label containing the sex of the person.
#
#

#  #### Use a StringIndexer along with the OneHotEncoderEstimator to convert the name, state, and sex columns into numeric vectors

# In[6]:

# Index the name column into nameInd, then one-hot encode that index into
# name_ohe. The encoder is applied with transform() directly, without a
# fit step.
name_indexer = StringIndexer(inputCol="name", outputCol="nameInd")
name_trsf = name_indexer.fit(df).transform(df)  # transform(df.select("name"))
name_ohe = OneHotEncoder(inputCol="nameInd", outputCol="name_ohe")
name_featurevect = name_ohe.transform(name_trsf)

# In[7]:

# Bare expression: displays the DataFrame when run as a notebook cell.
name_featurevect

# In[8]:

# Same two steps for the state column, applied on top of the name result so
# the output carries both the name and the state encodings.
state_indexer = StringIndexer(inputCol="state", outputCol="stateInd")
state_trsf = state_indexer.fit(name_featurevect).transform(
    name_featurevect)  # transform(df.select("state"))
state_ohe = OneHotEncoder(inputCol="stateInd", outputCol="state_ohe")
state_featurevect = state_ohe.transform(state_trsf)

Example #24

0

Show file

File: H3.py Project: chuanchuanchuan/Data_analytics

    return result


# Local Spark context plus an SQLContext on top of it.
conf = SparkConf().setMaster("local").setAppName("My App")
sc = SparkContext(conf=conf)
sqlContext = SQLContext(sc)
# Read the tab-separated input, dropping empty lines.
data = sc.textFile("/home/bigdatalab28/test.sql")
data = data.filter(lambda line: line != '')
data = data.map(lambda line: line.split("\t"))
# Keep fields 3, 7 and 9 of every line as label_0, birth_city and id_city.
schemaVal = data.map(lambda x: (x[3], x[7], x[9])).map(
    lambda x: Row(label_0=x[0], birth_city=x[1], id_city=x[2]))
schemaVal = sqlContext.createDataFrame(schemaVal)
# 70/10/20 train/validation/test split with seed 123.
(train_data, valid_data, test_data) = schemaVal.randomSplit([0.7, 0.1, 0.2],
                                                            123)
# The name `indexer` is reused for every stage below (both StringIndexer
# and OneHotEncoder instances); `indexed` accumulates the added columns.
indexer = StringIndexer(inputCol="label_0", outputCol="label")
indexed = indexer.fit(train_data).transform(train_data)
# birth_city -> bc (index) -> bc_one (one-hot)
indexer = StringIndexer(inputCol="birth_city", outputCol="bc")
indexed = indexer.fit(indexed).transform(indexed)
indexer = OneHotEncoder(inputCol="bc", outputCol="bc_one")
indexed = indexer.transform(indexed)
# id_city -> ic (index) -> ic_one (one-hot)
indexer = StringIndexer(inputCol="id_city", outputCol="ic")
indexed = indexer.fit(indexed).transform(indexed)
indexer = OneHotEncoder(inputCol="ic", outputCol="ic_one")
indexed = indexer.transform(indexed)
# Combine both one-hot vectors into the single features column.
assembler = VectorAssembler(inputCols=["ic_one", "bc_one"],
                            outputCol="features")
train = assembler.transform(indexed)
# Naive Bayes with smoothing 1.0, fitted on the assembled training data.
nb = NaiveBayes(smoothing=1.0)
model = nb.fit(train)
# NOTE(review): this repeats the label indexing on train_data and
# overwrites `indexed`; the code that follows is not visible here, so
# confirm that train_data (rather than valid_data/test_data) is intended.
indexer = StringIndexer(inputCol="label_0", outputCol="label")
indexed = indexer.fit(train_data).transform(train_data)

Example #25

0

Show file

File: train_spark_mllib_model.py Project: dataAlgorithms/data

# Check the buckets out
ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

#
# Extract features tools in with pyspark.ml.feature
#
from pyspark.ml.feature import StringIndexer, VectorAssembler

# Turn category fields into numeric index columns: every pass adds one
# "<column>_index" column to ml_bucketized_features.
for column in ["Carrier", "DayOfMonth", "DayOfWeek", "DayOfYear",
               "Origin", "Dest", "Route"]:
  string_indexer = StringIndexer(
    inputCol=column,
    outputCol=column + "_index"
  )
  ml_bucketized_features = string_indexer.fit(ml_bucketized_features)\
                                          .transform(ml_bucketized_features)

# Check out the indexes
ml_bucketized_features.show(6)

# Combine the continuous numeric fields and the index columns into one
# feature vector. Each index column is listed exactly once; "Origin_index"
# was previously listed twice, which duplicated that feature in the vector.
numeric_columns = ["DepDelay", "Distance"]
index_columns = ["Carrier_index", "DayOfMonth_index",
                   "DayOfWeek_index", "DayOfYear_index", "Origin_index",
                   "Dest_index", "Route_index"]
vector_assembler = VectorAssembler(
  inputCols=numeric_columns + index_columns,
  outputCol="Features_vec"
)
final_vectorized_features = vector_assembler.transform(ml_bucketized_features)

Example #26

0

Show file

File: spark_pipeline.py Project: aalepere/ds_ci_cd

from pyspark.sql.functions import expr

sc = SparkContext("local", "Spark Pipeline")
sqlContext = SQLContext(sc)

# Tab-separated Titanic data with a header row; column types are inferred.
df = sqlContext.read.csv("../data/titanic.csv",
                         sep="\t",
                         header=True,
                         inferSchema=True)
train, test = df.randomSplit([0.7, 0.3], seed=12345)

mapping = sqlContext.createDataFrame([(0, "male"), (1, "female")],
                                     ["id", "category"])

# Index the Sex column into SexIndex on the training split.
indexer = StringIndexer(inputCol="Sex", outputCol="SexIndex")
train = indexer.fit(train).transform(train)
train.show()

# Quantiles come back in the order requested:
# percentiles[0] is the 1st percentile (lower bound) and
# percentiles[1] is the 99th percentile (upper bound).
percentiles = train.approxQuantile("Fare", [0.01, 0.99], 0.01)

# Winsorize: cap fares at the upper bound and floor them at the lower
# bound. The bounds were previously swapped, which mapped every fare at or
# above the 1st percentile to the 1st percentile.
winsorize = expr("""IF(Fare >= {}, {},IF(Fare <= {},{},Fare))""".format(
    percentiles[1], percentiles[1], percentiles[0], percentiles[0]))

# withColumn returns a new DataFrame; the result must be assigned back or
# the winsorized column is discarded.
train = train.withColumn("Fare", winsorize)
train.show()

# Fill missing Age and Fare values with the column median.
imputer = Imputer(inputCols=["Age", "Fare"],
                  outputCols=["out_Age", "out_Fare"]).setStrategy("median")
train = imputer.fit(train).transform(train)
train.show()

Example #27

0

Show file

File: script4.py Project: pifouuu/ProjetBigData

# Vectorize the training and test rows with the broadcast unigram and
# trigram dictionaries, and time the whole step.
print "Creating feature vectors"
t0 = time()
dfTrainVec=dfTrain.map(partial(vectorize,dicoUni=dict_broad.value,dicoTri=dictTri_broad.value)).toDF(schema)
dfTestVec=dfTest.map(partial(vectorize,dicoUni=dict_broad.value,dicoTri=dictTri_broad.value)).toDF(schema)
tt = time() - t0
print "Dataframe created in {} second".format(round(tt,3))


# In[19]:

# Index the label column of the training vectors into 'target_indexed'.
print "Indexing labels"
t0 = time()
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainVec)
dfTrainIdx = string_indexer_model.transform(dfTrainVec)
# The result of take(1) is discarded; presumably called to trigger
# evaluation so that the timing below covers the indexing - confirm.
dfTrainIdx.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[20]:

# Decision tree on the 'featureVectors' column with the indexed label,
# limited to depth 10.
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol='featureVectors', labelCol='target_indexed', maxDepth=10)


# In[21]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

Example #28

0

Show file

full_rdd = sub_acc.union(sub_noacc)
full_rdd.cache()


#create a dataframe for encoding categorical variables
df = sqlContext.createDataFrame(full_rdd) #complete dataset
#df = sqlContext.createDataFrame(full_rdd.sample(withReplacement=False,fraction=0.25,seed=seed)) #let's start with a 1/4 of the data


# ## Convert categorical features
# 
# The following cells train a Spark StringIndexer to index the zip codes in the data set.

#define categorical indexers for the data
zipIndexer =  StringIndexer(inputCol='grid_zipcode', outputCol='grid_zipcodeIdx')#,handleInvalid='skip')
zipIdxModel = zipIndexer.fit(df)
indexed = zipIdxModel.transform(df)

indexed.cache()
#zipEncoder = OneHotEncoder(dropLast=False, inputCol="grid_zipcodeIdx", outputCol="grid_zipcodeVec")
#zipEncoded = zipEncoder.transform(td1)

# Keep the fitted indexer's labels so predicted indices can be mapped back to
# zip codes when rewriting predictions to the Elasticsearch index.
zipCodeLables = zipIdxModel._call_java("labels")
# Position in the label list is the index the StringIndexer assigned.
zipKey = dict(enumerate(zipCodeLables))


# ## Labeled Points
# 
# Spark MLLib algorithms take LabeledPoints, a special object tuple of (label, [features]). Before training the model we will run a simple map job to convert the SparkSQL DataFrame rows to LabeledPoints.

Example #29

0

Show file

File: train.py Project: aymen82/SparkImageRecognition

    features = sqc.read.parquet(input_features)
    features = features.filter(features['cls']!='None')\
                        .select(['cls', 'features'])\
                        .cache()
    print features

    features = sqc.createDataFrame(features.map(normalizer))
    print features


    training, valid = features.randomSplit([0.75, 0.25])

    labelIndexer = StringIndexer(inputCol="cls", outputCol="label")

    model = labelIndexer.fit(training)
    training = model.transform(training).rdd.map(lambda row: LabeledPoint(row.label, row.features))
    valid = model.transform(valid).rdd.map(lambda row: LabeledPoint(row.label, row.features))

    print training.first()
    #lr = LogisticRegression()
    #pipeline = Pipeline(stages=[labelIndexer,lr])



    # fit
    model = LogisticRegressionWithLBFGS.train(training, numClasses=10)

    #model = pipeline.fit(training)
    # evaluate
    #evaluator = BinaryClassificationEvaluator(metricName="areaUnderROC")

Example #30

0

Show file

spark = SparkSession.builder.appName('treecode').getOrCreate()
data = spark.read.csv('College.csv', inferSchema=True, header=True)
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler

assembler = VectorAssembler(inputCols=[
    'Apps', 'Accept', 'Enroll', 'Top10perc', 'Top25perc', 'F_Undergrad',
    'P_Undergrad', 'Outstate', 'Room_Board', 'Books', 'Personal', 'PhD',
    'Terminal', 'S_F_Ratio', 'perc_alumni', 'Expend', 'Grad_Rate'
],
                            outputCol="features")
output = assembler.transform(data)
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="Private", outputCol="PrivateIndex")
output_fixed = indexer.fit(output).transform(output)
final_data = output_fixed.select("features", 'PrivateIndex')
train_data, test_data = final_data.randomSplit([0.7, 0.3])
from pyspark.ml.classification import DecisionTreeClassifier, GBTClassifier, RandomForestClassifier
from pyspark.ml import Pipeline
dtc = DecisionTreeClassifier(labelCol='PrivateIndex', featuresCol='features')
rfc = RandomForestClassifier(labelCol='PrivateIndex', featuresCol='features')
gbt = GBTClassifier(labelCol='PrivateIndex', featuresCol='features')
dtc_model = dtc.fit(train_data)
rfc_model = rfc.fit(train_data)
gbt_model = gbt.fit(train_data)

dtc_predictions = dtc_model.transform(test_data)
rfc_predictions = rfc_model.transform(test_data)
gbt_predictions = gbt_model.transform(test_data)
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

Example #31

0

Show file

File: feature_engg.py Project: adi07tya/Spark_with_Python

from pyspark.ml.feature import StringIndexer

df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"),
                            (5, "c")], ["user_id", "category"])

indexer = StringIndexer(inputCol='category', outputCol='categoryIndex')
indexed = indexer.fit(df).transform(df)
indexed.show()

from pyspark.ml.linalg import Vectors
# VectorAssembler lives in pyspark.ml.feature; pyspark.ml.linalg only provides
# the vector/matrix types, so importing it from there raises ImportError.
from pyspark.ml.feature import VectorAssembler

# Single-row demo frame mixing scalar columns with a dense vector column.
df = spark.createDataFrame(
    [(0, 18, 1.0, Vectors.dense([0.0, 10.0, 0.5]), 1.0)],
    ["id", "hour", "mobile", "userFeatures", "clicked"])
df.show()

# Concatenate the scalar and vector inputs into one "features" vector column.
assembler = VectorAssembler(inputCols=["hour", "mobile", "userFeatures"],
                            outputCol="features")

output = assembler.transform(df)
print(
    "Assembled columns 'hour', 'mobile', 'userFeatures' to vector column 'features'"
)
output.select("features", "clicked").show()

Example #32

0

Show file

File: 400sample_07kmeans.py Project: pamulapati/cdswdemo

# create a table name to use for queries
#dfpfc.createOrReplaceTempView("census07")
# run a query
#fcout=myspark.sql('select * from census07 where salary > 100000')
#fcout.show(5)
# create a dataframe with valid rows
mydf=myspark.sql('select code as txtlabel, salary, total_emp from sample_07 where total_emp > 0 and total_emp< 1000000 and salary >0 and salary<500000' )
mydf.show(5)

# need to convert from text field to numeric
# this is a common requirement when using sparkML
from pyspark.ml.feature import StringIndexer

# this will convert each unique string into a numeric
indexer = StringIndexer(inputCol="txtlabel", outputCol="label")
indexed = indexer.fit(mydf).transform(mydf)
indexed.show(5)
# now we need to create  a  "label" and "features"
# input for using the sparkML library

from pyspark.ml.feature import VectorAssembler
from pyspark.ml.linalg import Vectors

# Pack the two numeric columns into a single "features" vector column.
assembler = VectorAssembler(
    inputCols=[ "total_emp","salary"],
    outputCol="features")
output = assembler.transform(indexed)
# note the column headers - "label" and "features" are the column names the
# code above assigns (not reserved keywords)
# show() prints the rows itself and returns None, so wrapping it in print()
# would only add a stray "None" line to the output.
output.show(3)

# use the kmeans clustering - do not write it yourself :-)

Example #33

0

Show file

def main(base_path):
    """Train a random-forest flight-delay classifier and save its stages.

    Reads ``{base_path}/data/simple_flight_delay_features.jsonl.bz2``, buckets
    ``ArrDelay`` into classes, string-indexes the categorical columns,
    assembles a feature vector, fits a ``RandomForestClassifier`` and prints
    its accuracy. The bucketizer, each string-indexer model, the vector
    assembler and the classifier are written under ``MODELS_MOUNTPATH``,
    which is not defined in this function (presumably a module-level
    constant; confirm).

    Args:
        base_path: Directory holding the ``data/`` folder; a falsy value
            falls back to ``"."``.
    """

    # Default to "."
    # NOTE(review): base_path is a parameter, so it is always bound and this
    # NameError guard cannot fire; only the falsy check below has an effect.
    try:
        base_path
    except NameError:
        base_path = "."
    if not base_path:
        base_path = "."

    APP_NAME = "train_spark_mllib_model.py"

    # If there is no SparkSession, create the environment
    # NOTE(review): sc and spark are assigned later in this function, which
    # makes them function-local; reading them here therefore raises
    # UnboundLocalError on every call, so the except branch always runs.
    try:
        sc and spark
    except (NameError, UnboundLocalError) as e:
        import findspark
        findspark.init()
        import pyspark
        import pyspark.sql

        sc = pyspark.SparkContext()
        spark = pyspark.sql.SparkSession(sc).builder.appName(
            APP_NAME).getOrCreate()

    # Example input record:
    # {
    #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
    #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
    #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
    # }
    #
    from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
    from pyspark.sql.types import StructType, StructField
    from pyspark.sql.functions import udf

    # Explicit schema for the JSON records; every field is nullable.
    schema = StructType([
        StructField("ArrDelay", DoubleType(), True),  # "ArrDelay":5.0
        StructField("CRSArrTime", TimestampType(),
                    True),  # "CRSArrTime":"2015-12-31T03:20:00.000-08:00"
        StructField("CRSDepTime", TimestampType(),
                    True),  # "CRSDepTime":"2015-12-31T03:05:00.000-08:00"
        StructField("Carrier", StringType(), True),  # "Carrier":"WN"
        StructField("DayOfMonth", IntegerType(), True),  # "DayOfMonth":31
        StructField("DayOfWeek", IntegerType(), True),  # "DayOfWeek":4
        StructField("DayOfYear", IntegerType(), True),  # "DayOfYear":365
        StructField("DepDelay", DoubleType(), True),  # "DepDelay":14.0
        StructField("Dest", StringType(), True),  # "Dest":"SAN"
        StructField("Distance", DoubleType(), True),  # "Distance":368.0
        StructField("FlightDate", DateType(),
                    True),  # "FlightDate":"2015-12-30T16:00:00.000-08:00"
        StructField("FlightNum", StringType(), True),  # "FlightNum":"6109"
        StructField("Origin", StringType(), True),  # "Origin":"TUS"
    ])

    input_path = "{}/data/simple_flight_delay_features.jsonl.bz2".format(
        base_path)
    features = spark.read.json(input_path, schema=schema)
    features.first()

    #
    # Check for nulls in features before using Spark ML
    # (runs one count() per column; the result is only printed)
    #
    null_counts = [(column, features.where(features[column].isNull()).count())
                   for column in features.columns]
    cols_with_nulls = filter(lambda x: x[1] > 0, null_counts)
    print(list(cols_with_nulls), flush=True)

    #
    # Add a Route variable ("<Origin>-<Dest>") to replace FlightNum
    #
    from pyspark.sql.functions import lit, concat
    features_with_route = features.withColumn(
        'Route', concat(features.Origin, lit('-'), features.Dest))
    features_with_route.show(6)

    #
    # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay; the five split
    # points below define four buckets (indices 0-3)
    #
    from pyspark.ml.feature import Bucketizer

    # Setup the Bucketizer
    splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
    arrival_bucketizer = Bucketizer(splits=splits,
                                    inputCol="ArrDelay",
                                    outputCol="ArrDelayBucket")

    # Save the bucketizer
    arrival_bucketizer_path = "{}/arrival_bucketizer_2.0.bin".format(
        MODELS_MOUNTPATH)
    arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)

    # Apply the bucketizer
    ml_bucketized_features = arrival_bucketizer.transform(features_with_route)
    ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()

    #
    # Extract features tools in with pyspark.ml.feature
    #
    from pyspark.ml.feature import StringIndexer, VectorAssembler

    # Turn category fields into indexes
    for column in ["Carrier", "Origin", "Dest", "Route"]:
        string_indexer = StringIndexer(inputCol=column,
                                       outputCol=column + "_index")

        string_indexer_model = string_indexer.fit(ml_bucketized_features)
        ml_bucketized_features = string_indexer_model.transform(
            ml_bucketized_features)

        # Drop the original column
        ml_bucketized_features = ml_bucketized_features.drop(column)

        # Save the fitted string indexer model for this column
        string_indexer_output_path = "{}/string_indexer_model_{}.bin".format(
            MODELS_MOUNTPATH, column)
        string_indexer_model.write().overwrite().save(
            string_indexer_output_path)

    # Combine continuous, numeric fields with indexes of nominal ones
    # ...into one feature vector
    numeric_columns = [
        "DepDelay", "Distance", "DayOfMonth", "DayOfWeek", "DayOfYear"
    ]
    index_columns = [
        "Carrier_index", "Origin_index", "Dest_index", "Route_index"
    ]
    vector_assembler = VectorAssembler(inputCols=numeric_columns +
                                       index_columns,
                                       outputCol="Features_vec")
    final_vectorized_features = vector_assembler.transform(
        ml_bucketized_features)

    # Save the numeric vector assembler
    vector_assembler_path = "{}/numeric_vector_assembler.bin".format(
        MODELS_MOUNTPATH)
    vector_assembler.write().overwrite().save(vector_assembler_path)

    # Drop the index columns
    for column in index_columns:
        final_vectorized_features = final_vectorized_features.drop(column)

    # Inspect the finalized features
    final_vectorized_features.show()

    # Instantiate and fit random forest classifier on all the data
    # NOTE(review): maxBins=4657 is presumably sized to cover the largest
    # indexed categorical column - confirm against the data.
    from pyspark.ml.classification import RandomForestClassifier
    rfc = RandomForestClassifier(featuresCol="Features_vec",
                                 labelCol="ArrDelayBucket",
                                 predictionCol="Prediction",
                                 maxBins=4657,
                                 maxMemoryInMB=1024)
    model = rfc.fit(final_vectorized_features)

    # Save the new model over the old one
    model_output_path = "{}/spark_random_forest_classifier.flight_delays.5.0.bin".format(
        MODELS_MOUNTPATH)
    model.write().overwrite().save(model_output_path)

    # Evaluate the model
    # NOTE(review): predictions are made on final_vectorized_features, the
    # same data the model was fit on - no held-out test set is used here.
    predictions = model.transform(final_vectorized_features)

    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    evaluator = MulticlassClassificationEvaluator(predictionCol="Prediction",
                                                  labelCol="ArrDelayBucket",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)
    print("Accuracy = {}".format(accuracy), flush=True)

    # Check the distribution of predictions
    predictions.groupBy("Prediction").count().show()

    # Check a sample
    predictions.sample(False, 0.001, 18).orderBy("CRSDepTime").show(6)

Example #34

0

Show file

File: decision tree.py Project: magyarr/Forex

#SPARK SQL
dataframe = pycsv.csvToDataFrame(sqlContext, rddUSD, sep=",")
dataframe.registerTempTable("dataUSDuprv")
# show() returns None, so keep the query result in dff1 and display it in a
# separate step; chaining .show() onto the assignment left dff1 bound to None.
dff1 = sqlContext.sql("SELECT closeJPY FROM dataUSDuprv")
dff1.show()
dataframe.show()

#LabeledPoint
lpUSD = vectorsUSD.map(transformationDT.transformToLabeledPoint)
lpUSD.take(5)
dfUSD = sqlContext.createDataFrame(lpUSD, ["label", "features"])
dfUSD.select("label", "features").show(10)

#String Indexer
stringIndexer = StringIndexer(inputCol="label", outputCol="indexed")
si_model = stringIndexer.fit(dfUSD)
td = si_model.transform(dfUSD)
td.collect()
td.show()

#Splitting data
(trainingData, testData) = td.randomSplit([0.6, 0.4])
trainingData.count()
testData.count()
testData.collect()

#Creating decision tree model
dtClassifer = DecisionTreeClassifier(labelCol="indexed",
                                     minInstancesPerNode=1500)
dtModel = dtClassifer.fit(trainingData)
dtModel.numNodes

Example #35

0

Show file

File: 09_LinReg_Project.py Project: EdoardoCarlesi/PySpark

spark = SparkSession.builder.appName('lr_ex').getOrCreate()

base_path = '/home/edoardo/Udemy/PySpark/Python-and-Spark-for-Big-Data-master/Spark_for_Machine_Learning/Linear_Regression/'
file_name = 'cruise_ship_info.csv'

data = spark.read.csv(base_path + file_name, inferSchema=True, header=True)
data.printSchema()

data.select(corr('crew', 'passengers')).show()

#print(data.columns)
#print(data.groupBy('Cruise_line').count())

# This one transforms the strings into numbers
indexer = StringIndexer(inputCol='Cruise_line', outputCol='Cruise_cat')
indexed = indexer.fit(data).transform(data)

inCols = [
    'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density',
    'Cruise_cat'
]

# Including Cruse_cat makes things worse ?!?!
#inCols = ['Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density']
assembler = VectorAssembler(inputCols=inCols, outputCol='features')
output = assembler.transform(indexed)
indexed.show()

final_data = output.select('features', 'crew')
train_data, test_data = final_data.randomSplit([0.6, 0.4])

Example #36

0

Show file

File: IndexToString.py Project: tianqinglei/Spark

from pyspark.ml.feature import IndexToString, StringIndexer

from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession \
        .builder \
        .appName("IndexToStringExample") \
        .getOrCreate()

    df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"),
                                (4, "a"), (5, "c")], ["id", "category"])

    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)

    print("Transformed string column '%s' to indexed column '%s'" %
          (indexer.getInputCol(), indexer.getOutputCol()))
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    converter = IndexToString(inputCol="categoryIndex",
                              outputCol="originalCategory")
    converted = converter.transform(indexed)

    print(
        "Transformed indexed column '%s' back to original string column '%s' using "
        "labels in metadata" %

Example #37

0

Show file

def run_similar(mysql_user, mysql_pwd, mysql_host, mysql_db, kaiguan=1):
    """Compute pairwise movie similarity and append it to a MySQL table.

    Loads the ``movies_movieinfo`` table over JDBC, tokenizes a segmented
    copy of the ``description`` column, turns the tokens into L2-normalised
    vectors and writes the dot product of every pair of rows (item1.ID <
    item2.ID) back to MySQL.

    Args:
        mysql_user: MySQL user name used for both the read and the write.
        mysql_pwd: Password for ``mysql_user``.
        mysql_host: MySQL host; port 3306 is hard-coded in the JDBC URL.
        mysql_db: Database name.
        kaiguan: Mode switch. ``0`` uses HashingTF + IDF vectors and writes
            to table ``xxxxxxxxx``; ``1`` (default) uses Word2Vec vectors and
            writes to ``movies_moviesimilar_fromspark``. Any other value
            writes nothing.
    """

    sc = SparkContext(appName="calculate similar matrix",
                      master="spark://master:7077")
    sqlContext = SQLContext(sc)

    # Open a connection and fetch the data
    # DataFrame

    df_movieinfo = sqlContext.read.format("jdbc")\
            .option("url", "jdbc:mysql://"+mysql_host+":3306/"+mysql_db)\
            .option("dbtable", "movies_movieinfo")\
            .option("user", mysql_user)\
            .option("password",mysql_pwd)\
            .load()
    # Index and one-hot encode the directors column. The encoded frame is
    # only displayed here; the similarity computation below does not use it.
    stringIndexer = StringIndexer(inputCol="directors",
                                  outputCol="director_Index")
    model = stringIndexer.fit(df_movieinfo)
    indexed = model.transform(df_movieinfo)

    encoder = OneHotEncoder(inputCol="director_Index", outputCol="direcVec")
    encoded = encoder.transform(indexed)
    encoded.select('direcVec').show()

    # Declare the Spark data type that matches the Python function's return type:
    # the Python function returns a string, which corresponds to StringType in PySpark
    # (seg is defined outside this function; presumably a word segmenter - confirm)
    segUDF = psf.UserDefinedFunction(seg, StringType())

    # Add a column using withColumn
    df_seg = df_movieinfo.withColumn('description_2', segUDF('description'))

    # word2vec(df_movieinfo, "description", "result")
    # 3. Split the text into words with the Tokenizer
    tokenizer = Tokenizer(inputCol="description_2", outputCol="words")
    t_words = tokenizer.transform(df_seg)

    if kaiguan == 0:
        # TF-IDF path: hash tokens into 100 buckets, weight by IDF, L2-normalise.
        hashingTF = HashingTF(inputCol="words",
                              outputCol="rawFeatures",
                              numFeatures=100)
        featurizedData = hashingTF.transform(t_words)
        idf = IDF(inputCol="rawFeatures", outputCol="features")
        idfModel = idf.fit(featurizedData)
        normalizer = Normalizer(inputCol="features", outputCol="norm", p=2.0)
        dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
        rescaledData = idfModel.transform(featurizedData)
        df_norm = normalizer.transform(rescaledData)
        # Self-join on item1.ID < item2.ID so each unordered pair appears once,
        # then take the dot product of the two normalised vectors.
        similarity_idf = df_norm.alias("item1").join(df_norm.alias("item2"), psf.col("item1.ID") < psf.col("item2.ID"))\
            .select(
                psf.col("item1.ID").alias("item1"),
                psf.col("item2.ID").alias("item2"),
                dot_udf("item1.norm", "item2.norm").alias("similar"))\
            .sort("item1", "item2")
        # Open a connection and write the data
        similarity_idf.write.format("jdbc").option("url", "jdbc:mysql://"+mysql_host+":3306/"+mysql_db)\
        .option("dbtable", "xxxxxxxxx").option("user", mysql_user).option("password",mysql_pwd).mode('append').save()

    elif kaiguan == 1:

        # 4. Convert the token lists into sparse numeric vectors (term-frequency vectors)
        cv = CountVectorizer(inputCol="words",
                             outputCol="features",
                             vocabSize=5,
                             minDF=2.0)
        cv_model = cv.fit(t_words)
        cv_result = cv_model.transform(t_words)
        # 5. Convert the tokenizer's word lists into numeric vectors
        # (Word2Vec reads the "words" column, not the CountVectorizer output)
        word2Vec = Word2Vec(vectorSize=100,
                            minCount=0,
                            inputCol="words",
                            outputCol="result")
        w2v_model = word2Vec.fit(cv_result)
        result = w2v_model.transform(cv_result)
        normalizer = Normalizer(inputCol="result", outputCol="norm", p=2.0)
        data = normalizer.transform(result)
        dot_udf = psf.udf(lambda x, y: float(x.dot(y)), DoubleType())
        similarity_w2v = data.alias("item1").join(data.alias("item2"), psf.col("item1.ID") < psf.col("item2.ID"))\
            .select(
                psf.col("item1.ID").alias("item1"),
                psf.col("item2.ID").alias("item2"),
                dot_udf("item1.norm", "item2.norm").alias("dot"))\
            .sort("item1", "item2")

        # Open a connection and write the data
        similarity_w2v.write.format("jdbc")\
                .option("url", "jdbc:mysql://"+mysql_host+":3306/"+mysql_db)\
                .option("dbtable", "movies_moviesimilar_fromspark")\
                .option("user", mysql_user)\
                .option("password",mysql_pwd)\
                .mode('append').save()

Example #38

0

Show file

File: spark101.py Project: ChienHsiung/python

    data = data.filter(lambda x:x.split(',')[0] != 'label').map(lambda line: line.split(','))
    if train:
        data = data.map(
            lambda line: (Vectors.dense(np.asarray(line[1:]).astype(np.float32)),
                          'class_'+str(line[0]),int(line[0])) )
    else:
        # Test data gets dummy labels. We need the same structure as in Train data
        data = data.map( lambda line: (Vectors.dense(np.asarray(line[1:]).astype(np.float32)),'class_'+str(line[0]),int(line[0])) ) 
    return sqlcontext.createDataFrame(data, ['features', 'category','label'])

train_df = load_data_frame("train.csv")
test_df = load_data_frame("test.csv", shuffle=False, train=False)
from pyspark.ml.feature import StringIndexer

string_indexer = StringIndexer(inputCol="category", outputCol="index_category")
fitted_indexer = string_indexer.fit(train_df)
indexed_df = fitted_indexer.transform(train_df)

from distkeras.transformers import *
from pyspark.ml.feature import OneHotEncoder
####OneHot
nb_classes = 9
encoder = OneHotTransformer(nb_classes, input_col='label', output_col="label_encoded")
dataset_train = encoder.transform(indexed_df)
dataset_test = encoder.transform(test_df)

###encoder
from pyspark.ml.feature import MinMaxScaler
transformer = MinMaxTransformer(n_min=0.0, n_max=1.0, \
                                o_min=0.0, o_max=250.0, \
                                input_col="features", \

Example #39

0

Show file

from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=X_col, outputCol=Y_col)
from pyspark.ml.feature import OneHotEncoder, StringIndexer
encoder = OneHotEncoder(inputCol="indexed", outputCol="features")

df = spark.createDataFrame([
    (0, "a"),
    (1, "b"),
    (2, "c"),
    (3, "a"),
    (4, "a"),
    (5, "c")
], ["id", "category"])

stringIndexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
model = stringIndexer.fit(df)
indexed = model.transform(df)
indexed.printSchema()
indexed.take(1)
indexed.take(5)


train_Y.printSchema()
train_X.printSchema()
train_X.schema.json()
train_X.columns
spark_df.columns
import json
data_schema = json.loads(train_X.schema.json())
isinstance(data_schema, dict)
StringIndexer(inputCol='x1', outputCol='indexed_x1')

Example #40

0

Show file

File: index_to_string_example.py Project: lhfei/spark-in-action

# $example off$
from pyspark.sql import SparkSession

if __name__ == "__main__":
    spark = SparkSession\
        .builder\
        .appName("IndexToStringExample")\
        .getOrCreate()

    # $example on$
    df = spark.createDataFrame(
        [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")],
        ["id", "category"])

    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    model = indexer.fit(df)
    indexed = model.transform(df)

    print("Transformed string column '%s' to indexed column '%s'"
          % (indexer.getInputCol(), indexer.getOutputCol()))
    indexed.show()

    print("StringIndexer will store labels in output column metadata\n")

    converter = IndexToString(inputCol="categoryIndex", outputCol="originalCategory")
    converted = converter.transform(indexed)

    print("Transformed indexed column '%s' back to original string column '%s' using "
          "labels in metadata" % (converter.getInputCol(), converter.getOutputCol()))
    converted.select("id", "categoryIndex", "originalCategory").show()
    # $example off$

Example #41

0

Show file

File: naivebys.py Project: nimms9/2019_Summer_CSEE5590_Big_Data_Programming

# Create spark session
spark = SparkSession.builder.appName("ICP7").getOrCreate()
spark.sparkContext.setLogLevel("ERROR")

# Define input path
input_path = "C:\\Users\\Lenovo\\PycharmProjects\\M2_ICP7"

# Load data and select feature and label columns
data = spark.read.format("csv").option("header", True).option("inferSchema", True).option("delimiter", ",").load(input_path + "\\adult.csv")
data = data.withColumnRenamed("age", "label").select("label", col("education-num").alias("education-num"), col(" hours-per-week").alias("hours-per-week"),col(" education").alias("education"),col(" fnlwgt").alias("fnlwgt"),col(" sex").alias("sex"),col(" relationship").alias("relationship"))
data = data.select(data.label.cast("double"),"education-num", "hours-per-week","education","sex","fnlwgt","relationship")

new_data=data.toDF("label","education-num","hours-per-week","education","sex","fnlwgt","relationship")
indexer = StringIndexer(inputCol="education", outputCol="new_education")
indexed = indexer.fit(new_data).transform(new_data)

indexer1 = StringIndexer(inputCol="sex", outputCol="new_sex")
indexed1 = indexer1.fit(indexed).transform(indexed)

indexer2= StringIndexer(inputCol="relationship",outputCol="new_rel")
indexed2= indexer2.fit(indexed1).transform(indexed1)

indexed2=indexed2.drop("sex","education","relationship")
indexed2.show()


# Create vector assembler for feature columns
assembler = VectorAssembler(inputCols=indexed2.columns[1:], outputCol="features")
data = assembler.transform(indexed2)

Example #42

0

Show file

File: pyspark_gradient_boosted_trees.py Project: bravekjh/Spark

    rf = GBTRegressor(maxIter=30, maxDepth=4, labelCol="indexedLabel")

    model = rf.fit(train)
    predictionAndLabels = model.transform(test).select("prediction", "indexedLabel") \
        .map(lambda x: (x.prediction, x.indexedLabel))

    metrics = RegressionMetrics(predictionAndLabels)
    print("rmse %.3f" % metrics.rootMeanSquaredError)
    print("r2 %.3f" % metrics.r2)
    print("mae %.3f" % metrics.meanAbsoluteError)


# Script entry point: index the labels of the sample LIBSVM data set, split it
# 70/30 and run the classification and regression tests on the split.
if __name__ == "__main__":
    # The script takes no command-line arguments; passing any prints the
    # usage line to stderr and exits with status 1.
    if len(sys.argv) > 1:
        print("Usage: gradient_boosted_trees", file=sys.stderr)
        exit(1)
    sc = SparkContext(appName="Jay")
    sqlContext = SQLContext(sc)

    # Load and parse the data file into a dataframe.
    df = MLUtils.loadLibSVMFile(sc, "data/mllib/sample_libsvm_data.txt").toDF()

    # Map labels into an indexed column of labels in [0, numLabels)
    stringIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
    si_model = stringIndexer.fit(df)
    td = si_model.transform(df)
    # No seed is passed, so the split differs from run to run.
    [train, test] = td.randomSplit([0.7, 0.3])
    testClassification(train, test)
    testRegression(train, test)
    sc.stop()

Example #43

0

Show file

    cluster = Cluster(['192.168.246.236'])
    session = cluster.connect("dev")

    sc = SparkContext(conf=conf)
    sql = SQLContext(sc)
    spark = SparkSession(sc)

    print("SparkContext => ", sc)
    print("SQLContext => ", sql)

    stations = sql.read.format("org.apache.spark.sql.cassandra").load(
        keyspace="dev", table="station")
    clean_data = sql.read.format("org.apache.spark.sql.cassandra").load(
        keyspace="dev", table="clean_daily_measurement")

    stationsIds = getStationsIds(stations)
    stationCount = 1

    ##Make a join clean_data with stations in order to get the province
    joinedData = clean_data.join(stations, ["station_id"])

    indexer = StringIndexer(inputCol="province", outputCol="stationIndex")
    indexed = indexer.fit(joinedData).transform(joinedData)

    doBayes(indexed)

    print("--- %s seconds ---" % (time.time() - start_time))
    print("END!!!")
    sc.stop()

Example #44

0

Show file

File: script5.py Project: pifouuu/ProjetBigData

# In[18]:

from pyspark.ml.feature import StringIndexer
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import MulticlassClassificationEvaluator


print "Fitting the classifier on bigram features"
t0 = time()

string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
lr = LogisticRegression(featuresCol='bigramVectors',labelCol='target_indexed',maxIter=30, regParam=0.01)
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

string_indexer_model = string_indexer.fit(dfBigram)
dfTrainIndexed = string_indexer_model.transform(dfBigram).cache()
lrModel = lr.fit(dfTrainIndexed)

tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[19]:

print "Testing precision of the model"
t0 = time()

dfValidSelect=dfValid.map(partial(vectorizeBi,dico=dict_broad.value)).toDF(['bigramVectors','label']).cache()
dfValidIndexed = string_indexer_model.transform(dfValidSelect).cache()
df_valid_pred = lrModel.transform(dfValidIndexed).cache()

Example #45

0

Show file

File: spark_model_with_airplanes.py Project: rjurney/Agile_Data_Code_2

def _load_pickle_log(path):
  """Load a run log (a pickled list) from *path*.

  Returns an empty list when the file cannot be opened (IOError), when it is
  empty or truncated (EOFError), or when the unpickled object is not a list.

  NOTE: unpickling can execute arbitrary code; only point this at log files
  written by this script.
  """
  import pickle

  try:
    # The context manager closes the handle even if unpickling fails
    with open(path, "rb") as log_file:
      log = pickle.load(log_file)
  except (IOError, EOFError):
    return []
  return log if isinstance(log, list) else []


def _save_pickle_log(log, path):
  """Pickle *log* to *path*, closing the file so the data is flushed to disk."""
  import pickle

  with open(path, "wb") as log_file:
    pickle.dump(log, log_file)


def main(base_path):
  """Train and evaluate a random forest flight-delay classifier.

  Reads `{base_path}/data/simple_flight_delay_features_airplanes.json`,
  buckets `ArrDelay` into classes, indexes the categorical columns, assembles
  a feature vector, then trains and scores a RandomForestClassifier over
  several random train/test splits. The fitted transformers and the model are
  saved under `{base_path}/models/`, and the average scores and feature
  importances are compared with the previous run via pickled logs kept in
  the same directory.

  Args:
    base_path: project root containing the `data/` and `models/` directories.
  """
  APP_NAME = "train_spark_mllib_model.py"
  
  # If there is no SparkSession, create the environment.
  # NOTE: `sc` and `spark` are assigned below, which makes them local to this
  # function, so this lookup always raises UnboundLocalError (a NameError
  # subclass) and the except branch always runs.
  try:
    sc and spark
  except NameError:
    import findspark
    findspark.init()
    import pyspark
    import pyspark.sql
    
    sc = pyspark.SparkContext()
    spark = pyspark.sql.SparkSession(sc).builder.appName(APP_NAME).getOrCreate()
  
  #
  # {
  #   "ArrDelay":5.0,"CRSArrTime":"2015-12-31T03:20:00.000-08:00","CRSDepTime":"2015-12-31T03:05:00.000-08:00",
  #   "Carrier":"WN","DayOfMonth":31,"DayOfWeek":4,"DayOfYear":365,"DepDelay":14.0,"Dest":"SAN","Distance":368.0,
  #   "FlightDate":"2015-12-30T16:00:00.000-08:00","FlightNum":"6109","Origin":"TUS"
  # }
  #
  from pyspark.sql.types import StringType, IntegerType, FloatType, DoubleType, DateType, TimestampType
  from pyspark.sql.types import StructType, StructField
  from pyspark.sql.functions import udf
  
  schema = StructType([
    StructField("ArrDelay", DoubleType(), True),
    StructField("CRSArrTime", TimestampType(), True),
    StructField("CRSDepTime", TimestampType(), True),
    StructField("Carrier", StringType(), True),
    StructField("DayOfMonth", IntegerType(), True),
    StructField("DayOfWeek", IntegerType(), True),
    StructField("DayOfYear", IntegerType(), True),
    StructField("DepDelay", DoubleType(), True),
    StructField("Dest", StringType(), True),
    StructField("Distance", DoubleType(), True),
    StructField("FlightDate", DateType(), True),
    StructField("FlightNum", StringType(), True),
    StructField("Origin", StringType(), True),
    StructField("Route", StringType(), True),
    StructField("TailNum", StringType(), True),
    StructField("EngineManufacturer", StringType(), True),
    StructField("EngineModel", StringType(), True),
    StructField("Manufacturer", StringType(), True),
    StructField("ManufacturerYear", StringType(), True),
    StructField("OwnerState", StringType(), True),
  ])
  
  input_path = "{}/data/simple_flight_delay_features_airplanes.json".format(
    base_path
  )
  features = spark.read.json(input_path, schema=schema)
  # Pull one row so a bad path or schema fails here rather than later
  features.first()
  
  #
  # Add the hour of day of scheduled arrival/departure
  #
  from pyspark.sql.functions import hour
  features_with_hour = features.withColumn(
    "CRSDepHourOfDay",
    hour(features.CRSDepTime)
  )
  features_with_hour = features_with_hour.withColumn(
    "CRSArrHourOfDay",
    hour(features.CRSArrTime)
  )
  features_with_hour.select("CRSDepTime", "CRSDepHourOfDay", "CRSArrTime", "CRSArrHourOfDay").show()
  
  #
  # Check for nulls in features before using Spark ML
  #
  null_counts = [(column, features_with_hour.where(features_with_hour[column].isNull()).count()) for column in features_with_hour.columns]
  # A concrete list (not a lazy filter object) so the report can be re-read
  cols_with_nulls = [entry for entry in null_counts if entry[1] > 0]
  print("\nNull Value Report")
  print("-----------------")
  print(tabulate(cols_with_nulls, headers=["Column", "Nulls"]))
  
  #
  # Use pyspark.ml.feature.Bucketizer to bucketize ArrDelay into delay classes
  # (one bucket per interval between consecutive split points below)
  #
  from pyspark.ml.feature import Bucketizer
  
  # Setup the Bucketizer
  splits = [-float("inf"), -15.0, 0, 30.0, float("inf")]
  arrival_bucketizer = Bucketizer(
    splits=splits,
    inputCol="ArrDelay",
    outputCol="ArrDelayBucket"
  )
  
  # Save the model
  arrival_bucketizer_path = "{}/models/arrival_bucketizer_2.0.bin".format(base_path)
  arrival_bucketizer.write().overwrite().save(arrival_bucketizer_path)
  
  # Apply the model
  ml_bucketized_features = arrival_bucketizer.transform(features_with_hour)
  ml_bucketized_features.select("ArrDelay", "ArrDelayBucket").show()
  
  #
  # Extract features tools in with pyspark.ml.feature
  #
  from pyspark.ml.feature import StringIndexer, VectorAssembler
  
  # Turn category fields into indexes
  string_columns = ["Carrier", "Origin", "Dest", "Route",
                    "TailNum"]
  for column in string_columns:
    string_indexer = StringIndexer(
      inputCol=column,
      outputCol=column + "_index"
    )
    
    string_indexer_model = string_indexer.fit(ml_bucketized_features)
    ml_bucketized_features = string_indexer_model.transform(ml_bucketized_features)
    
    # Save the fitted indexer for this column
    string_indexer_output_path = "{}/models/string_indexer_model_4.0.{}.bin".format(
      base_path,
      column
    )
    string_indexer_model.write().overwrite().save(string_indexer_output_path)
  
  # Combine continuous, numeric fields with indexes of nominal ones
  # ...into one feature vector
  numeric_columns = [
    "DepDelay", "Distance",
    "DayOfYear",
    "CRSDepHourOfDay",
    "CRSArrHourOfDay"]
  index_columns = [column + "_index" for column in string_columns]
  
  vector_assembler = VectorAssembler(
    inputCols=numeric_columns + index_columns,
    outputCol="Features_vec"
  )
  final_vectorized_features = vector_assembler.transform(ml_bucketized_features)
  
  # Save the numeric vector assembler
  vector_assembler_path = "{}/models/numeric_vector_assembler_5.0.bin".format(base_path)
  vector_assembler.write().overwrite().save(vector_assembler_path)
  
  # Drop the index columns
  for column in index_columns:
    final_vectorized_features = final_vectorized_features.drop(column)
  
  # Inspect the finalized features
  final_vectorized_features.show()
  
  #
  # Cross validate, train and evaluate classifier: loop split_count times for 4 metrics
  #
  
  from collections import defaultdict
  from pyspark.ml.classification import RandomForestClassifier
  from pyspark.ml.evaluation import MulticlassClassificationEvaluator
  
  scores = defaultdict(list)
  feature_importances = defaultdict(list)
  metric_names = ["accuracy", "weightedPrecision", "weightedRecall", "f1"]
  split_count = 3
  
  for i in range(1, split_count + 1):
    print("\nRun {} out of {} of test/train splits in cross validation...".format(
      i,
      split_count,
    )
    )
    
    # Test/train split
    training_data, test_data = final_vectorized_features.randomSplit([0.8, 0.2])
    
    # Instantiate and fit random forest classifier on the training split
    rfc = RandomForestClassifier(
      featuresCol="Features_vec",
      labelCol="ArrDelayBucket",
      predictionCol="Prediction",
      maxBins=4896,
    )
    model = rfc.fit(training_data)
    
    # Save the new model over the old one (each split overwrites the last)
    model_output_path = "{}/models/spark_random_forest_classifier.flight_delays.baseline.bin".format(
      base_path
    )
    model.write().overwrite().save(model_output_path)
    
    # Evaluate model using test data
    predictions = model.transform(test_data)
    
    # Evaluate this split's results for each metric
    for metric_name in metric_names:
      evaluator = MulticlassClassificationEvaluator(
        labelCol="ArrDelayBucket",
        predictionCol="Prediction",
        metricName=metric_name
      )
      score = evaluator.evaluate(predictions)
      
      scores[metric_name].append(score)
      print("{} = {}".format(metric_name, score))
    
    #
    # Collect feature importances
    #
    feature_names = vector_assembler.getInputCols()
    feature_importance_list = model.featureImportances
    for feature_name, feature_importance in zip(feature_names, feature_importance_list):
      feature_importances[feature_name].append(feature_importance)
  
  #
  # Evaluate average and STD of each metric and print a table
  #
  import numpy as np
  score_averages = defaultdict(float)
  
  # Compute the table data
  average_stds = []  # ha
  for metric_name in metric_names:
    metric_scores = scores[metric_name]
    
    average_accuracy = sum(metric_scores) / len(metric_scores)
    score_averages[metric_name] = average_accuracy
    
    std_accuracy = np.std(metric_scores)
    
    average_stds.append((metric_name, average_accuracy, std_accuracy))
  
  # Print the table
  print("\nExperiment Log")
  print("--------------")
  print(tabulate(average_stds, headers=["Metric", "Average", "STD"]))
  
  #
  # Persist the score to a score log that exists between runs
  #
  
  # Load the score log or initialize an empty one
  score_log_filename = "{}/models/score_log.pickle".format(base_path)
  score_log = _load_pickle_log(score_log_filename)
  
  # Compute this run's score log entry
  score_log_entry = {
    metric_name: score_averages[metric_name] for metric_name in metric_names
  }
  
  # Compute and display the change in score for each metric.
  # With no previous entry, compare the run with itself (all deltas are 0).
  try:
    last_log = score_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_log = score_log_entry
  
  experiment_report = []
  for metric_name in metric_names:
    run_delta = score_log_entry[metric_name] - last_log[metric_name]
    experiment_report.append((metric_name, run_delta))
  
  print("\nExperiment Report")
  print("-----------------")
  print(tabulate(experiment_report, headers=["Metric", "Score"]))
  
  # Append this run's average scores to the log
  score_log.append(score_log_entry)
  
  # Persist the log for next run
  _save_pickle_log(score_log, score_log_filename)
  
  #
  # Analyze and report feature importance changes
  #
  
  # Compute averages for each feature
  feature_importance_entry = defaultdict(float)
  for feature_name, value_list in feature_importances.items():
    average_importance = sum(value_list) / len(value_list)
    feature_importance_entry[feature_name] = average_importance
  
  # Sort the feature importances in descending order and print
  import operator
  sorted_feature_importances = sorted(
    feature_importance_entry.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  print("\nFeature Importances")
  print("-------------------")
  print(tabulate(sorted_feature_importances, headers=['Name', 'Importance']))
  
  #
  # Compare this run's feature importances with the previous run's
  #
  
  # Load the feature importance log or initialize an empty one
  feature_log_filename = "{}/models/feature_log.pickle".format(base_path)
  feature_log = _load_pickle_log(feature_log_filename)
  
  # Compute and display the change in score for each feature.
  # With no previous entry, fall back to a copy of this run's importances.
  try:
    last_feature_log = feature_log[-1]
  except (IndexError, TypeError, AttributeError):
    last_feature_log = defaultdict(float)
    for feature_name, importance in feature_importance_entry.items():
      last_feature_log[feature_name] = importance
  
  # Compute the deltas
  feature_deltas = {}
  for feature_name in feature_importances.keys():
    run_delta = feature_importance_entry[feature_name] - last_feature_log[feature_name]
    feature_deltas[feature_name] = run_delta
  
  # Sort feature deltas, biggest change first
  sorted_feature_deltas = sorted(
    feature_deltas.items(),
    key=operator.itemgetter(1),
    reverse=True
  )
  
  # Display sorted feature deltas
  print("\nFeature Importance Delta Report")
  print("-------------------------------")
  print(tabulate(sorted_feature_deltas, headers=["Feature", "Delta"]))
  
  # Append this run's average importances to the log
  feature_log.append(feature_importance_entry)
  
  # Persist the log for next run
  _save_pickle_log(feature_log, feature_log_filename)

Example #46

0

Show file

    mc = BinaryClassificationEvaluator(rawPredictionCol="prediction",
                                       labelCol="label")
    cv = CrossValidator(estimator=estimator,
                        estimatorParamMaps=paramGrid,
                        evaluator=mc,
                        numFolds=2)

    # for row in train_df.rdd.collect():
    #     print("row: ", row.uri)
    #     load_image_from_uri(row.uri)

    # cvModel = cv.fit(train_df)
    # mc.evaluate(cvModel.transform(test_df))
    #

    stringIndexer = StringIndexer(inputCol="label_name",
                                  outputCol="categoryIndex")
    indexed_dateset = stringIndexer.fit(train_df).transform(train_df)

    # encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")

    encoder = OneHotEncoderEstimator(inputCols=["categoryIndex"],
                                     outputCols=["categoryVec"])

    encoder_model = encoder.fit(indexed_dateset)

    image_dataset = encoder_model.transform(indexed_dateset)

    image_dataset.show()

    transformers = estimator.fit(image_dataset)

Example #47

0

Show file

File: user-restaurant-data-proc.py Project: vivsiv/249_project

		 False if r.attributes['Good For'] is None else r.attributes['Good For']['dinner'],
		 False if r.attributes['Good For'] is None else r.attributes['Good For']['lunch'],
		 False if r.attributes['Good For'] is None else r.attributes['Good For']['breakfast'],
		 False if r.attributes['Ambience'] is None else r.attributes['Ambience']['romantic'],
		 False if r.attributes['Ambience'] is None else r.attributes['Ambience']['upscale'],
		 False if r.attributes['Ambience'] is None else r.attributes['Ambience']['casual'],
		 False if (r.attributes['Alcohol'] is None or r.attributes['Alcohol'] == 'none') else True,
		 False if r.attributes['Take-out'] is None else r.attributes['Take-out']]
	).toDF(clustering_columns)

# Drop rows with null values so the indexer/encoder/assembler below only see
# complete rows.
lv_clustering_data = lv_clustering_data.dropna()

# Neighborhood feature engineering: index the 'neighborhood' column into
# 'neigh_index', then one-hot encode that index into 'neigh_vec'.
stringIndexer = StringIndexer(inputCol="neighborhood", outputCol="neigh_index")
lv_model = stringIndexer.fit(lv_clustering_data)
lv_indexed = lv_model.transform(lv_clustering_data)
# NOTE(review): transform() is called on the encoder without fit(), so this
# relies on the pre-3.0 Spark OneHotEncoder transformer API -- confirm the
# Spark version.
encoder = OneHotEncoder(dropLast=False, inputCol="neigh_index", outputCol="neigh_vec")
lv_encoded = encoder.transform(lv_indexed)

#initial feature set
# assembler = VectorAssembler(
#     inputCols=["stars", "price_range", "neigh_vec"],
#     outputCol="features_vec")

# Expanded feature set: every clustering column after the first two, plus the
# encoded neighborhood vector.
# NOTE(review): which columns the first two are is not visible here -- confirm.
feature_columns = clustering_columns[2:]
feature_columns.append("neigh_vec")
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features_vec")

Example #48

0

Show file

File: preprocessor.py Project: FNDaily/amazon-sagemaker-examples

def _repackage_bundle_and_upload(s3, bundle_name, bucket_name, key_prefix):
    """Turn /tmp/<bundle_name>.zip into a .tar.gz, upload it, and clean up.

    The zip is extracted to /tmp/<bundle_name>, its `bundle.json` and `root`
    entries are re-archived as /tmp/<bundle_name>.tar.gz, the archive is
    uploaded to `<key_prefix>/<bundle_name>.tar.gz` in `bucket_name`, and all
    three temporary paths are removed.
    """
    import tarfile
    import zipfile

    zip_path = "/tmp/{}.zip".format(bundle_name)
    extract_dir = "/tmp/{}".format(bundle_name)
    tar_path = "/tmp/{}.tar.gz".format(bundle_name)

    with zipfile.ZipFile(zip_path) as zf:
        zf.extractall(extract_dir)

    # Writing back the content as a .tar.gz file
    with tarfile.open(tar_path, "w:gz") as tar:
        tar.add(extract_dir + "/bundle.json", arcname='bundle.json')
        tar.add(extract_dir + "/root", arcname='root')

    file_name = key_prefix + '/' + bundle_name + '.tar.gz'
    s3.Bucket(bucket_name).upload_file(tar_path, file_name)

    os.remove(zip_path)
    os.remove(tar_path)
    shutil.rmtree(extract_dir)


def main():
    """Featurize the input CSV with Spark and publish data and models to S3.

    Reads a headerless CSV from `s3_input_data_location`, indexes the `cat`
    column into `label`, one-hot encodes the six feature columns through a
    Pipeline fitted on an 80% training split, writes the transformed train and
    validation sets as text under the output bucket/prefix, and uploads the
    MLeap-serialized pipeline (`model.tar.gz`) and label post-processor
    (`postprocess.tar.gz`) under the model bucket/prefix.
    """
    spark = SparkSession.builder.appName("PySparkTitanic").getOrCreate()

    args = getResolvedOptions(sys.argv, ['s3_input_data_location',
                                         's3_output_bucket',
                                         's3_output_bucket_prefix',
                                         's3_model_bucket',
                                         's3_model_bucket_prefix'])

    # This is needed to write RDDs to file which is the only way to write nested Dataframes into CSV.
    spark.sparkContext._jsc.hadoopConfiguration().set("mapred.output.committer.class",
                                                      "org.apache.hadoop.mapred.FileOutputCommitter")

    train = spark.read.csv(args['s3_input_data_location'], header=False)

    oldColumns = train.schema.names
    newColumns = ['buying', 'maint', 'doors', 'persons', 'lug_boot', 'safety', 'cat']

    # Rename columns positionally. A plain loop replaces reduce/xrange, which
    # are not available as builtins on Python 3. As before, more input columns
    # than names raises IndexError.
    for idx, old_name in enumerate(oldColumns):
        train = train.withColumnRenamed(old_name, newColumns[idx])

    # dropping null values
    train = train.dropna()

    # Target label: fitted on the whole dataset, before the split
    catIndexer = StringIndexer(inputCol="cat", outputCol="label")

    labelIndexModel = catIndexer.fit(train)
    train = labelIndexModel.transform(train)

    # Post-processor that maps the numeric label back to its string category
    converter = IndexToString(inputCol="label", outputCol="cat")

    # Splitting in train and test set. Beware : It sorts the dataset
    (traindf, validationdf) = train.randomSplit([0.8, 0.2])

    # Index the categorical feature columns.
    # NOTE(review): these indexers are fitted via pipeline.fit(traindf) below,
    # i.e. on the training split only, not on the whole dataset.
    buyingIndexer = StringIndexer(inputCol="buying", outputCol="indexedBuying")
    maintIndexer = StringIndexer(inputCol="maint", outputCol="indexedMaint")
    doorsIndexer = StringIndexer(inputCol="doors", outputCol="indexedDoors")
    personsIndexer = StringIndexer(inputCol="persons", outputCol="indexedPersons")
    lug_bootIndexer = StringIndexer(inputCol="lug_boot", outputCol="indexedLug_boot")
    safetyIndexer = StringIndexer(inputCol="safety", outputCol="indexedSafety")

    # One Hot Encoder on indexed features
    buyingEncoder = OneHotEncoder(inputCol="indexedBuying", outputCol="buyingVec")
    maintEncoder = OneHotEncoder(inputCol="indexedMaint", outputCol="maintVec")
    doorsEncoder = OneHotEncoder(inputCol="indexedDoors", outputCol="doorsVec")
    personsEncoder = OneHotEncoder(inputCol="indexedPersons", outputCol="personsVec")
    lug_bootEncoder = OneHotEncoder(inputCol="indexedLug_boot", outputCol="lug_bootVec")
    safetyEncoder = OneHotEncoder(inputCol="indexedSafety", outputCol="safetyVec")

    # Create the vector structured data (label,features(vector))
    assembler = VectorAssembler(inputCols=["buyingVec", "maintVec", "doorsVec", "personsVec", "lug_bootVec", "safetyVec"], outputCol="features")

    # Chain featurizers in a Pipeline
    pipeline = Pipeline(stages=[buyingIndexer, maintIndexer, doorsIndexer, personsIndexer, lug_bootIndexer, safetyIndexer, buyingEncoder, maintEncoder, doorsEncoder, personsEncoder, lug_bootEncoder, safetyEncoder, assembler])

    # Train model.  This also runs the indexers.
    model = pipeline.fit(traindf)

    # Delete previous data from output
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(args['s3_output_bucket'])

    bucket.objects.filter(Prefix=args['s3_output_bucket_prefix']).delete()

    # Save transformed training data to CSV in S3 by converting to RDD.
    transformed_traindf = model.transform(traindf)
    transformed_train_rdd = transformed_traindf.rdd.map(lambda x: (x.label, x.features))
    lines = transformed_train_rdd.map(toCSVLine)
    lines.saveAsTextFile('s3a://' + args['s3_output_bucket'] + '/' +args['s3_output_bucket_prefix'] + '/' + 'train')

    # Similar data processing for validation dataset.
    predictions = model.transform(validationdf)
    transformed_validation_rdd = predictions.rdd.map(lambda x: (x.label, x.features))
    lines = transformed_validation_rdd.map(toCSVLine)
    lines.saveAsTextFile('s3a://' + args['s3_output_bucket'] + '/' +args['s3_output_bucket_prefix'] + '/' + 'validation')

    # Serialize and store via MLeap
    SimpleSparkSerializer().serializeToBundle(model, "jar:file:/tmp/model.zip", predictions)

    # SageMaker expects a .tar.gz file but MLeap produces a .zip file.
    _repackage_bundle_and_upload(s3, 'model',
                                 args['s3_model_bucket'],
                                 args['s3_model_bucket_prefix'])

    # Save postprocessor
    SimpleSparkSerializer().serializeToBundle(converter, "jar:file:/tmp/postprocess.zip", predictions)

    _repackage_bundle_and_upload(s3, 'postprocess',
                                 args['s3_model_bucket'],
                                 args['s3_model_bucket_prefix'])

Example #49

0

Show file

File: titanic_spark.py Project: R-I-S-Khan/Article-Classifier-using-Apache-Spark

def analyze(sc, train_path, test_path):
    """Train and compare three classifiers on the Titanic train/test files.

    Parses both text files, stacks them into one DataFrame tagged by a `Mark`
    column, fills missing `Age`/`Fare` with their means, derives a `Title`
    column from `Name`, indexes the categorical columns, and packs the
    features into dense vectors. The rows marked 'train' are split 70/30 into
    train/validate; logistic regression, a decision tree and a random forest
    are fitted and their validation scores printed.

    Args:
        sc: SparkContext used to read the input files.
        train_path: path of the training text file.
        test_path: path of the test text file.
    """
    train_rdd = sc.textFile(train_path)
    test_rdd = sc.textFile(test_path)
    train_df = parseTrain(train_rdd)
    test_df = parseTest(test_rdd)
    train_df = train_df.withColumn('Mark', lit('train'))
    # The test set gets a placeholder Survived=0 so both frames share a schema
    test_df = (test_df.withColumn('Survived',
                                  lit(0)).withColumn('Mark', lit('test')))
    test_df = test_df[train_df.columns]
    ## Append Test data to Train data
    df = train_df.unionAll(test_df)
    df = (df.withColumn('Age', df['Age'].cast('double')).withColumn(
        'SibSp', df['SibSp'].cast('double')).withColumn(
            'Parch', df['Parch'].cast('double')).withColumn(
                'Fare', df['Fare'].cast('double')).withColumn(
                    'Survived', df['Survived'].cast('double')))
    df.printSchema()
    numVars = ['Survived', 'Age', 'SibSp', 'Parch', 'Fare']
    # NOTE(review): `missing` is not used afterwards; the call is kept because
    # countNull is defined elsewhere and may have side effects.
    missing = {var: countNull(df, var) for var in numVars}
    age_mean = df.groupBy().mean('Age').first()[0]
    fare_mean = df.groupBy().mean('Fare').first()[0]
    df = df.na.fill({'Age': age_mean, 'Fare': fare_mean})
    ## created user defined function to extract title
    # NOTE(review): this keeps everything before the first '.', not just the
    # honorific -- confirm that is intended.
    getTitle = udf(lambda name: name.split('.')[0].strip(), StringType())
    df = df.withColumn('Title', getTitle(df['Name']))
    df.select('Name', 'Title').show(3)
    catVars = ['Pclass', 'Sex', 'Embarked', 'Title']

    # A former standalone 'Sex' StringIndexer fit was removed here: its result
    # was overwritten by the pipeline output below before ever being used.

    def indexer(df, col):
        """Return a StringIndexer model fitted on `col`, output `<col>_indexed`."""
        si = StringIndexer(inputCol=col, outputCol=col + '_indexed').fit(df)
        return si

    indexers = [indexer(df, col) for col in catVars]
    pipeline = Pipeline(stages=indexers)
    df_indexed = pipeline.fit(df).transform(df)
    df_indexed.select('Embarked', 'Embarked_indexed').show(10)
    catVarsIndexed = [i + '_indexed' for i in catVars]
    featuresCol = numVars + catVarsIndexed
    featuresCol.remove('Survived')
    labelCol = ['Mark', 'Survived']
    row = Row('mark', 'label', 'features')
    df_indexed = df_indexed[labelCol + featuresCol]
    # 0-mark, 1-label, 2-features
    # map features to DenseVector
    lf = df_indexed.rdd.map(lambda r:
                            (row(r[0], r[1], DenseVector(r[2:])))).toDF()
    # index label
    # convert numeric label to categorical, which is required by
    # decisionTree and randomForest
    lf = StringIndexer(inputCol='label',
                       outputCol='index').fit(lf).transform(lf)
    lf.show(3)
    train = lf.where(lf.mark == 'train')
    test = lf.where(lf.mark == 'test')
    # random split further to get train/validate
    train, validate = train.randomSplit([0.7, 0.3], seed=121)
    print('Train Data Number of Row: ' + str(train.count()))
    print('Validate Data Number of Row: ' + str(validate.count()))
    print('Test Data Number of Row: ' + str(test.count()))
    lr = LogisticRegression(maxIter=100, regParam=0.05,
                            labelCol='index').fit(train)

    # Evaluate model based on auc ROC(default for binary classification)
    def testModel(model, validate=validate):
        """Score `model` on the validation split with the default metric."""
        pred = model.transform(validate)
        evaluator = BinaryClassificationEvaluator(labelCol='index')
        return evaluator.evaluate(pred)

    print('AUC ROC of Logistic Regression model is: ' + str(testModel(lr)))
    dt = DecisionTreeClassifier(maxDepth=3, labelCol='index').fit(train)
    rf = RandomForestClassifier(numTrees=100, labelCol='index').fit(train)
    models = {
        'LogisticRegression': lr,
        'DecistionTree': dt,
        'RandomForest': rf
    }
    # .items() works on both Python 2 and 3 (iteritems() is Python 2 only)
    modelPerf = {k: testModel(v) for k, v in models.items()}
    print(modelPerf)

Example #50

0

Show file

File: stock_prediction_random_forest.py Project: meethariprasad/LearnSpark

# Create ngrams of size 2 from the stop-word-filtered tokens

myngram = NGram(inputCol="stopRemoved", outputCol="ngrams", n=2)
data = myngram.transform(data)
data = data.withColumn('ngrams', data.ngrams.cast(ArrayType(StringType(), True)))

# Apply count vectorizer to convert to vector of counts of the ngrams

myCountVectorizer = CountVectorizer(inputCol="ngrams", outputCol="countVect", minDF=1.0)
data = myCountVectorizer.fit(data).transform(data)

# Transform the label using StringIndexer

si_label = StringIndexer(inputCol="label", outputCol="label2", handleInvalid="skip")
data = si_label.fit(data).transform(data)
# withColumn replaces the existing 'label' column with the indexed values.
# The former bare `data.drop('label')` discarded its result (DataFrames are
# immutable) and was a no-op, so it has been removed.
data = data.withColumn('label', data.label2)

# Divide into training and test data.
# The two filters are exact complements: the previous test filter
# (Date >= '20141231') also matched rows already in the training set,
# leaking training data into the evaluation.

trainData = data[data['Date'] < '20150101']
testData = data[data['Date'] >= '20150101']

# define the random forest classifier model

rf = RandomForestClassifier(labelCol="label", featuresCol="countVect", numTrees=3, maxDepth=4, maxBins=200)
# perform a grid search on a set of parameter values

grid = ParamGridBuilder().addGrid(rf.numTrees, [2, 5])\
                         .addGrid(rf.maxDepth, [2, 5])\

Example #51

0

Show file

File: sparkTestv6.py Project: mattosinski/big-data-seminar-pyspark

# Derive extra columns in pandas before handing the data back to Spark.
pandas_df['week'] = pandas_df['Dates'].dt.weekofyear
# x_sim / y_sim are substrings of the X / Y text columns, converted to
# numbers below.
# NOTE(review): X is overwritten with the same [1:8] slice as x_sim, while Y
# is converted unsliced -- confirm this asymmetry is intended.
pandas_df['x_sim'] = pandas_df['X'].str[1:8]
pandas_df['X'] = pandas_df['X'].str[1:8]
pandas_df['y_sim'] = pandas_df['Y'].str[0:6]
pandas_df['X'] = pd.to_numeric(pandas_df['X'])
pandas_df['Y'] = pd.to_numeric(pandas_df['Y'])
pandas_df['x_sim'] = pd.to_numeric(pandas_df['x_sim'])
pandas_df['y_sim'] = pd.to_numeric(pandas_df['y_sim'])

# Convert the pandas frame back into a Spark DataFrame
data_df = sqlContext.createDataFrame(pandas_df)

# Encode the police district as a feature: index 'PdDistrict', then one-hot
# encode the index into the 'pd' vector column.

stringIndexer = StringIndexer(inputCol="PdDistrict", outputCol="PdDistrict_Index")
model = stringIndexer.fit(data_df)
indexed = model.transform(data_df)
encoder = OneHotEncoder(dropLast=False, inputCol="PdDistrict_Index", outputCol="pd")
encoded = encoder.transform(indexed)

# Release data_df from the cache.
# NOTE(review): no cache()/persist() call on data_df is visible in this
# snippet -- confirm it was ever cached.
data_df.unpersist() 

# Encode the dependent variable: index 'Category' into 'Category_Index'
classifyIndexer = StringIndexer(inputCol="Category", outputCol="Category_Index")
classifymodel = classifyIndexer.fit(encoded)
encoded2 = classifymodel.transform(encoded)



#keep the following columns: x, y, hour, day, month, year, dayofweek, week, x_sim, y_sim

Example #52

0

Show file

File: regression_sample.py Project: oopchoi/spark

                 strim(df2._c13).cast("double").alias("weight")) \
    .withColumn("grade", functions.lit("high")) \
    .withColumn("gender", functions.lit("woman"))

# Stack the six per-group DataFrames (df3..df8) into one.
df9 = df3.union(df4).union(df5).union(df6).union(df7).union(df8)

# Columns: year, height, weight, grade, gender
df9.show(5, False)
df9.printSchema()

# Convert the string columns to double-valued index columns
gradeIndexer = StringIndexer(inputCol="grade", outputCol="gradecode")

genderIndexer = StringIndexer(inputCol="gender", outputCol="gendercode")

# Apply the indexers one after the other; df11 carries both code columns.
df10 = gradeIndexer.fit(df9).transform(df9)
df11 = genderIndexer.fit(df10).transform(df10)

df11.show(3, False)
df11.printSchema()

# Assemble height and the two index columns into a single 'features' vector
assembler = VectorAssembler(inputCols=["height", "gradecode", "gendercode"], outputCol="features")

df12 = assembler.transform(df11)

df12.show(truncate=False)

# Random 70/30 split into training and test sets (no seed is given)
samples = df12.randomSplit([0.7, 0.3])
training = samples[0]
test = samples[1]

Example #53

0

Show file

File: ConsumeGBNYCReg.py Project: MahsaBadami/Azure-MachineLearning-DataScience

     WHEN (pickup_hour <= 6 OR pickup_hour >= 20) THEN "Night" 
     WHEN (pickup_hour >= 7 AND pickup_hour <= 10) THEN "AMRush" 
     WHEN (pickup_hour >= 11 AND pickup_hour <= 15) THEN "Afternoon"
     WHEN (pickup_hour >= 16 AND pickup_hour <= 19) THEN "PMRush"
    END as TrafficTimeBins
    FROM taxi_test 
"""
# Run the SQL statement built above to produce the test frame with the added
# TrafficTimeBins column.
taxi_df_test_with_newFeatures = sqlContext.sql(sqlStatement)

## CACHE DATA-FRAME IN MEMORY & MATERIALIZE DF IN MEMORY
taxi_df_test_with_newFeatures.cache()
# count() is an action, so it forces the cached frame to be computed now
taxi_df_test_with_newFeatures.count()

## INDEX AND ONE-HOT ENCODING
# Each stage below indexes one categorical column and one-hot encodes the
# index; the output of one stage (encoded1, encoded2, ...) feeds the next.
# NOTE(review): the indexers are fitted on this test frame itself; whether
# their mapping matches the one used at training time is not visible here.
stringIndexer = StringIndexer(inputCol="vendor_id", outputCol="vendorIndex")
model = stringIndexer.fit(taxi_df_test_with_newFeatures) # Input data-frame is the cleaned one from above
indexed = model.transform(taxi_df_test_with_newFeatures)
encoder = OneHotEncoder(dropLast=False, inputCol="vendorIndex", outputCol="vendorVec")
encoded1 = encoder.transform(indexed)

# rate_code -> rateIndex -> rateVec
stringIndexer = StringIndexer(inputCol="rate_code", outputCol="rateIndex")
model = stringIndexer.fit(encoded1)
indexed = model.transform(encoded1)
encoder = OneHotEncoder(dropLast=False, inputCol="rateIndex", outputCol="rateVec")
encoded2 = encoder.transform(indexed)

# payment_type -> paymentIndex -> paymentVec
stringIndexer = StringIndexer(inputCol="payment_type", outputCol="paymentIndex")
model = stringIndexer.fit(encoded2)
indexed = model.transform(encoded2)
encoder = OneHotEncoder(dropLast=False, inputCol="paymentIndex", outputCol="paymentVec")
encoded3 = encoder.transform(indexed)

Example #54

0

Show file

File: tps_baseline.py Project: cool-pot/MusicRecommendation

from csv import reader
from pyspark.mllib.recommendation import *
from pyspark.sql.functions import format_string

# Tab-separated triplets file; it is read without a header, so the columns
# are named _c0, _c1, _c2.
tuple_path = "s3://million-song-dataset-yizhou/TasteProfile/train_triplets.txt"

df = spark.read.load(tuple_path, format="csv", sep="\t", inferSchema="true", header=None)


# Transform index: map the string ids in _c0 / _c1 to numeric indexes
# (the SQL below treats _c2 as the click count).
from pyspark.ml.feature import StringIndexer
user_indexer = StringIndexer(inputCol="_c0", outputCol="user_index")
song_indexer = StringIndexer(inputCol="_c1", outputCol="song_index")

partial_indexed = user_indexer.fit(df).transform(df)
indexed = song_indexer.fit(partial_indexed).transform(partial_indexed)

# Expose the indexed frame to Spark SQL under the name "indexed"
indexed.createOrReplaceTempView("indexed")

# Format each row as "user,song,click", then split it back into a list of
# strings per row; that RDD is the input to ALS.trainImplicit.
res_df = spark.sql("select user_index, song_index, _c2 as click from indexed")
res_df = res_df.select(format_string("%.0f,%.0f,%d",res_df.user_index,res_df.song_index,res_df.click))
rdd = res_df.rdd.flatMap(list).map(lambda x:x.split(","))
# Implicit-feedback ALS with rank 25 and a fixed seed
model = ALS.trainImplicit(rdd, 25, seed=10)


# Generate userIndex: save the distinct "original id,index" pairs as text
temp1 = spark.sql("select distinct _c0,user_index from indexed")
user_df = temp1.select(format_string("%s,%.0f",temp1._c0,temp1.user_index))
user_df.write.save("s3://million-song-dataset-yizhou/TasteProfile/userIndex",format="text")

# Generate songIndex

Example #55

0

Show file

File: script3_bis.py Project: pifouuu/ProjetBigData

# Re-vectorize the train and test sets against the selected dictionary and
# time the step. vectorizeBi, dictSel_broad and schema come from earlier cells.
print "Creating sparse vectors for all data based on this new dictionary"
t0 = time()
dfTrainSelect=dfTrain.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(schema)
dfTestSelect=dfTest.map(partial(vectorizeBi,dico=dictSel_broad.value)).toDF(schema)
# take(1) is an action: it triggers evaluation so the timing below measures
# real work (only as much as is needed to produce one row).
dfTrainSelect.take(1)
dfTestSelect.take(1)
tt = time() - t0
print "Done in {} second".format(round(tt,3))


# In[328]:

# Index the 'label' column into the numeric 'target_indexed' column
from pyspark.ml.feature import StringIndexer
string_indexer = StringIndexer(inputCol='label', outputCol='target_indexed')
string_indexer_model = string_indexer.fit(dfTrainSelect)
dfTrainIndexed = string_indexer_model.transform(dfTrainSelect)


# In[329]:

# Decision tree over the bigram vectors, limited to depth 10
from pyspark.ml.classification import DecisionTreeClassifier
dt = DecisionTreeClassifier(featuresCol='bigramVectors', labelCol='target_indexed', maxDepth=10)


# In[330]:

# Evaluator comparing 'prediction' with 'target_indexed' using the
# 'precision' metric
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(predictionCol='prediction', labelCol='target_indexed', metricName='precision')

Example #56

0

Show file

File: preprocessing.py Project: smsubrahmannian/Collaborative-Filtering

# Load the yelp.tip collection from MongoDB (df_review is loaded before this
# snippet).
df_tip = sqlContext.read.format("com.mongodb.spark.sql.DefaultSource")\
    .option("uri","mongodb://"+ip_address+"/yelp.tip").load()

# One business id per line of the text file; the constant second column only
# exists to build a DataFrame and is dropped again by the select.
rstdata = sc.textFile('./Collaborative-Filtering/data/restaurant_ids_final.txt').map(lambda x: (x,1))\
.toDF(['business_id','biz_ix']).select('business_id')

df_review.persist()
df_tip.persist()
rstdata.persist()

## Mapping

# Distinct user ids from reviews and tips, each given a numeric 'user_ix'
userMap = df_review.select('user_id').union(
    df_tip.select('user_id')).distinct()
indexer_userid = StringIndexer(inputCol="user_id", outputCol="user_ix")
userMap = indexer_userid.fit(userMap).transform(userMap)

# Same for business ids; the inner join with rstdata then keeps only the ids
# listed in the restaurant file.
bizMap = df_review.select('business_id').union(
    df_tip.select('business_id')).distinct()
indexer_biz = StringIndexer(inputCol="business_id", outputCol="biz_ix")
bizMap = indexer_biz.fit(bizMap).transform(bizMap)
bizMap = rstdata.join(bizMap, on='business_id', how='inner')

## Join Dataframe to the review table

# Attach the numeric indexes to each review. The inner join on bizMap drops
# reviews whose business is not in bizMap.
df_review_als = df_review.select('user_id', 'business_id', 'stars')
df_review_als = df_review_als.join(userMap, on='user_id',
                                   how='left_outer').join(bizMap,
                                                          on='business_id',
                                                          how='inner')
ALS_baseline_df = df_review_als.select('user_ix', 'biz_ix', 'stars')

Example #57

0

Show file

File: Binary Classification Algorithms.py Project: yoavfreund/databricks

# MAGIC %md
# MAGIC In this dataset, we have ordinal variables like education (Preschool - Doctorate), and also nominal variables like relationship (Wife, Husband, Own-child, etc). For simplicity's sake, we will use One-Hot Encoding to convert all categorical variables into binary vectors. It might be possible here to improve prediction accuracy by converting each categorical column with an appropriate method.
# MAGIC 
# MAGIC Here, we will use a combination of [StringIndexer](http://spark.apache.org/docs/latest/ml-features.html#stringindexer) and [OneHotEncoder](http://spark.apache.org/docs/latest/ml-features.html#onehotencoder) to convert the categorical variables. The OneHotEncoder will return a [SparseVector](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.SparseVector).

# COMMAND ----------

###One-Hot Encoding
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
  
# Columns holding string categories; each one gets a "<col>Index" column
# (StringIndexer) and a "<col>classVec" column (OneHotEncoder).
categoricalColumns = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
for categoricalCol in categoricalColumns:
  # Category Indexing with StringIndexer
  stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
  model = stringIndexer.fit(dataset)
  indexed = model.transform(dataset)
  # Use OneHotEncoder to convert categorical variables into binary SparseVectors
  # NOTE(review): on Spark 3.x OneHotEncoder is an Estimator and must be
  # fitted before transform() - confirm the Spark version this notebook targets.
  encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
  encoded = encoder.transform(indexed)
  # Feed the widened frame into the next iteration so the columns accumulate.
  dataset = encoded

# Parenthesised single-argument form prints the same thing under the
# Python 2 print statement and the Python 3 print() function.
print(dataset.take(1))

# COMMAND ----------

# MAGIC %md
# MAGIC The above code basically indexes each categorical column using the StringIndexer, and then converts the indexed categories into one-hot encoded variables. The resulting output has the binary vectors appended to the end of each row.

# COMMAND ----------

Example #58

0

Show file

#print(df_raw6.filter(df_raw6['text'] == '').count())


# In[ ]:





# In[71]:


from pyspark.ml.feature import StringIndexer

# Index the string column '_c14' into the new column 'OpenStatus_cat'.
indexer = StringIndexer(inputCol='_c14', outputCol='OpenStatus_cat')
indexer_model = indexer.fit(df_raw5)
indexed = indexer_model.transform(df_raw5)


# In[72]:


indexed.show()


# In[73]:


# Keep only the text and its indexed status column.
df_raw8 = indexed.select("text", "OpenStatus_cat")


# In[74]:

Example #59

0

Show file

File: string_indexer_example.py Project: 0xqq/spark

#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

from __future__ import print_function

from pyspark import SparkContext
from pyspark.sql import SQLContext
# $example on$
from pyspark.ml.feature import StringIndexer
# $example off$

if __name__ == "__main__":
    sc = SparkContext(appName="StringIndexerExample")
    sqlContext = SQLContext(sc)

    # $example on$
    rows = [(0, "a"), (1, "b"), (2, "c"), (3, "a"), (4, "a"), (5, "c")]
    df = sqlContext.createDataFrame(rows, ["id", "category"])
    indexer = StringIndexer(inputCol="category", outputCol="categoryIndex")
    # Fit on the frame, then apply the fitted model to that same frame.
    model = indexer.fit(df)
    indexed = model.transform(df)
    indexed.show()
    # $example off$

    sc.stop()

Example #60

0

Show file

                            stopWords=None if language == "english" else 
                                StopWordsRemover.loadDefaultStopWords(language))
    df = remover.transform(df)

# Window the token lists: build the slides, explode them with F.explode(),
# then derive the "word" and "window" columns from each exploded slide.
win = windowing(winz)
decompose = win.get_udf()
df = (
    df.withColumn("slides", decompose("tokens"))
    .withColumn("exploded", F.explode("slides"))
    .withColumn("word", get_mid("exploded"))
    .withColumn("window", rm_mid("exploded"))
)

# Drop every column except the two that the following stages read.
keep_cols = ["word", "window"]
df = df.drop(*[name for name in df.columns if name not in keep_cols])

# Index the "word" strings into the "label" column.
# (A .persist(StorageLevel.DISK_ONLY) / MEMORY_AND_DISK call was left disabled here.)
indexer = StringIndexer(inputCol="word", outputCol="label")
indexer_model = indexer.fit(df)
df = indexer_model.transform(df)

# Hash each "window" into a term-frequency vector.
hashingTF = HashingTF(inputCol="window", outputCol="rawFeatures")
df = hashingTF.transform(df)

# Rescale with IDF into "features" (previously named "idfFeatures"), then
# drop the intermediate raw vector.
idf = IDF(inputCol="rawFeatures", outputCol="features")
idfModel = idf.fit(df)
df = idfModel.transform(df)
df = df.drop("rawFeatures")

#pca = PCA(k=3, inputCol="idfFeatures", outputCol="features")
#model = pca.fit(df).transform(df)

# 70/30 split with a fixed seed, then fit the classifier on the training part.
train, test = df.randomSplit([0.7, 0.3], seed=24)
lr = LogisticRegression(regParam=0.001)
model = lr.fit(train)