from pyspark.ml.feature import StringIndexer

# Index a single string column into numeric label indices
indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
from pyspark.ml.feature import StringIndexer

# Since Spark 3.0, StringIndexer can index several columns in one pass
indexer = StringIndexer(inputCols=["label1", "label2"],
                        outputCols=["indexedLabel1", "indexedLabel2"])
from pyspark.ml.feature import StringIndexer

indexer = StringIndexer(inputCol="label", outputCol="indexedLabel")
data = spark.createDataFrame(
    [(0, "a"), (1, "b"), (2, "a"), (3, "c"), (4, "a")],
    ["id", "label"])
indexed = indexer.fit(data).transform(data)
indexed.describe().show()

In this example, we create a DataFrame with five rows and two columns ("id" and "label"), fit a StringIndexer on the "label" column, and transform it into a numeric "indexedLabel" column. Calling describe() on the resulting DataFrame ("indexed") prints summary statistics (count, mean, standard deviation, minimum, and maximum) for its numeric columns, including the new indices. Package library: PySpark's ml.feature package.
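By default (stringOrderType="frequencyDesc"), StringIndexer assigns index 0.0 to the most frequent label, with ties broken alphabetically. As a rough illustration of that assignment rule, here is a minimal pure-Python sketch that needs no Spark session; the function name string_index is invented for this example and is not part of the PySpark API:

```python
from collections import Counter

def string_index(values):
    """Mimic StringIndexer's default frequencyDesc ordering:
    most frequent label -> 0.0, ties broken alphabetically."""
    counts = Counter(values)
    # Sort labels by descending frequency, then alphabetically
    ordered = sorted(counts, key=lambda s: (-counts[s], s))
    mapping = {label: float(i) for i, label in enumerate(ordered)}
    return [mapping[v] for v in values]

labels = ["a", "b", "a", "c", "a"]
print(string_index(labels))  # [0.0, 1.0, 0.0, 2.0, 0.0]
```

With the data above, "a" appears three times and gets index 0.0, while "b" and "c" tie at one occurrence each and are ordered alphabetically, matching what the Spark example produces in its "indexedLabel" column.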