Example #1
 def tourTimePredict(self, data):
     '''
     Predict the tour time given data of boarding coordinate, getting off coordinate and boarding time.
     :param data: [upCoord, downCoord, upTime]
     :return: the predicted duration of the tour, in minutes.
     '''
     upCoord, downCoord, upTime = data
     # upCoord and downCoord are (longitude, latitude) pairs
     upCoord_idx = int((round(upCoord[0], 1) - 120.5) * 10 * 12 +
                       (round(upCoord[1], 1) - 30.4) * 10)
     manhLon = abs(upCoord[0] - downCoord[0])
     manhLat = abs(upCoord[1] - downCoord[1])
     test = self.spark.createDataFrame(
         [(upTime, upCoord_idx, manhLon, manhLat)],
         ['upTime', 'upCoord', 'manhLon', 'manhLat'])
     encoder_time = OneHotEncoder(inputCol='upTime',
                                  outputCol='upTime_onehot',
                                  dropLast=False)
     encoder_coord = OneHotEncoder(inputCol='upCoord',
                                   outputCol='upCoord_onehot',
                                   dropLast=False)
     assembler = VectorAssembler(inputCols=[
         'upTime_onehot', 'upCoord_onehot', 'manhLon', 'manhLat'
     ],
                                 outputCol='features')
     testDF = assembler.transform(
         encoder_coord.transform(encoder_time.transform(test)))
     return self.model.transform(testDF).head().prediction
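Note: most examples on this page call OneHotEncoder(...).transform(df) directly, which is the Spark 2.x API, where OneHotEncoder is a transformer. Since Spark 3.0 it is an estimator and must be fitted first; a minimal sketch of the 3.x pattern (the column names here are placeholders):

from pyspark.ml.feature import OneHotEncoder

# Spark >= 3.0: fit() returns a OneHotEncoderModel, which performs the transform
encoder = OneHotEncoder(inputCols=['upTime'], outputCols=['upTime_onehot'],
                        dropLast=False)
df = encoder.fit(df).transform(df)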
Example #2
def get_kdd_data(csv):
    """
    Input the filepath of the CSV data; expects the KDD format,
    with the columns in the standard order.
    """
    import numpy as np
    import pandas as pd
    from pyspark.ml.feature import OneHotEncoder, StringIndexer
    col_names = np.loadtxt('./col_names.txt', dtype='str')
    data = pd.read_csv(csv, header=None, names=col_names)

    # TODO: have Spark read the CSV directly, without the pandas intermediary
    df = spark.createDataFrame(data)

    #one-hot encode the string columns
    one_hot_cols = ['protocol_type', 'service', 'flag']
    for label in one_hot_cols:
        stringIndexer = StringIndexer(inputCol=label,
                                      outputCol=label + "_index")
        model = stringIndexer.fit(df)
        indexed = model.transform(df)

        encoder = OneHotEncoder(inputCol=label + "_index",
                                outputCol=label + "_vec")
        df = encoder.transform(indexed).drop(label + "_index").drop(label)

    return df
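The per-column loop above can also be expressed as a single Pipeline, which keeps the fitted stages together; a sketch under the same Spark 2.x API, reusing the column names from the snippet:

from pyspark.ml import Pipeline
from pyspark.ml.feature import OneHotEncoder, StringIndexer

stages = []
for label in ['protocol_type', 'service', 'flag']:
    stages.append(StringIndexer(inputCol=label, outputCol=label + "_index"))
    stages.append(OneHotEncoder(inputCol=label + "_index",
                                outputCol=label + "_vec"))
# drop the raw and index columns afterwards if desired, as the loop does
df = Pipeline(stages=stages).fit(df).transform(df)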
Example #3
def encoding(df):
    encoder0 = OneHotEncoder(inputCol='salutation', outputCol='salutation_Vec')
    encoder1 = OneHotEncoder(inputCol='paymenttype', outputCol='paymenttype_Vec')
    encoder2 = OneHotEncoder(inputCol='model', outputCol='model_Vec')
    # Apply the encoder transformer
    df = encoder0.transform(df)
    df = encoder1.transform(df)
    df = encoder2.transform(df)
    return df
def one_hot_encode(dfFull):

    dfNoNullNumbers = dfFull.fillna('0')

    # start with user id
    stringIndexer_user = StringIndexer(inputCol="userid", outputCol="UserIdx", handleInvalid='keep')
    model_user = stringIndexer_user.fit(dfNoNullNumbers)
    indexed_user = model_user.transform(dfNoNullNumbers)

    encoder_user = OneHotEncoder(inputCol="UserIdx", outputCol="UserVec")
    encoded_user = encoder_user.transform(indexed_user)
    
    # now do item
    stringIndexer_item = StringIndexer(inputCol="product", outputCol="ItemIdx", handleInvalid='keep')
    model_item = stringIndexer_item.fit(encoded_user)
    indexed_item = model_item.transform(encoded_user)

    encoder_item = OneHotEncoder(inputCol="ItemIdx", outputCol="ItemVec")
    encoded_item = encoder_item.transform(indexed_item)

    # now do category
    stringIndexer_cat = StringIndexer(inputCol="category", outputCol="CategoryCleanedIdx", handleInvalid='keep')
    model_cat = stringIndexer_cat.fit(encoded_item)
    indexed_cat = model_cat.transform(encoded_item)

    encoder_cat = OneHotEncoder(inputCol="CategoryCleanedIdx", outputCol="CategoryCleanedVec")
    encoded_cat = encoder_cat.transform(indexed_cat)

    # now do offerid
    stringIndexer_offer = StringIndexer(inputCol="offerid", outputCol="OfferIdx", handleInvalid='keep')
    model_offer = stringIndexer_offer.fit(encoded_cat)
    indexed_offer = model_offer.transform(encoded_cat)

    encoder_offer = OneHotEncoder(inputCol="OfferIdx", outputCol="OfferVec")
    encoded_offer = encoder_offer.transform(indexed_offer)

    # now do countrycode
    stringIndexer_cc = StringIndexer(inputCol="countrycode", outputCol="CcIdx", handleInvalid='keep')
    model_cc = stringIndexer_cc.fit(encoded_offer)
    indexed_cc = model_cc.transform(encoded_offer)

    encoder_cc = OneHotEncoder(inputCol="CcIdx", outputCol="CcVec")
    encoded_cc = encoder_cc.transform(indexed_cc)

    dfEncoded = encoded_cc.drop('userid').drop('UserIdx') \
        .drop('product').drop('ItemIdx') \
        .drop('offerid').drop('OfferIdx') \
        .drop('countrycode').drop('CcIdx') \
        .drop('category').drop('CategoryCleanedIdx') 

    return dfEncoded
def onehot_encoder(df, features_categorical_indexed):
    for s in features_categorical_indexed:
        encoder = OneHotEncoder(dropLast=True,
                                inputCol=s,
                                outputCol=(s + "_Vec"))
        df = encoder.transform(df)
    return df
Example #6
    def __clean_data(self, df, is_fraud="isfraud"):
        ignore = [is_fraud, 'label']

        #Remove unused columns
        df = df.drop(*['paysim_id', 'nameorig', 'namedest'])

        #String Indexing
        string_indexer = StringIndexer(inputCol="type",
                                       outputCol="type_numeric").fit(df)
        df = string_indexer.transform(df)
        df = df.drop(df.type)

        #One-hot encoding
        encoder = OneHotEncoder(inputCol="type_numeric",
                                outputCol="type_vector")
        df = encoder.transform(df)
        df = df.drop("type_numeric")

        #Label encoding
        label_stringIdx = StringIndexer(inputCol=is_fraud,
                                        outputCol='label').fit(df)
        df = label_stringIdx.transform(df)
        df = df.drop(is_fraud)

        #Vector Assembling
        assembler = VectorAssembler(
            inputCols=[x for x in df.columns if x not in ignore],
            outputCol='features')
        df = assembler.transform(df)

        # dataframe in the correct format
        selectedCols = ['label', 'features']
        df = df.select(selectedCols)

        return df
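The returned frame has the (label, features) shape that spark.ml classifiers expect; a hypothetical follow-up (the classifier choice, split ratio, and seed are assumptions, not part of the snippet):

from pyspark.ml.classification import LogisticRegression

train, test = df.randomSplit([0.8, 0.2], seed=42)
lr = LogisticRegression(featuresCol='features', labelCol='label')
model = lr.fit(train)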
Example #7
def oneHotEncodeColumns(df, cols):
    newdf = df
    for c in cols:
        onehotenc = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False)
        newdf = onehotenc.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-onehot", c)
    return newdf
Example #8
def add_categorical(df, train=False):
    from pyspark.ml.feature import OneHotEncoder, StringIndexer
    global index_model  # keep the fitted indexer so eval calls (train=False) can reuse it
    if train:
        indexer = StringIndexer(inputCol='ORIGIN', outputCol='origin_index')
        index_model = indexer.fit(df)
    indexed = index_model.transform(df)
    encoder = OneHotEncoder(inputCol='origin_index', outputCol='origin_onehot')
    return encoder.transform(indexed)
def oneHotEncodeColumns(df, cols):
    from pyspark.ml.feature import OneHotEncoder
    newdf = df
    for c in cols:
        onehotenc = OneHotEncoder(inputCol=c, outputCol=c+"-onehot", dropLast=False)
        newdf = onehotenc.transform(newdf).drop(c)
        newdf = newdf.withColumnRenamed(c+"-onehot", c)
    return newdf
Example #10
    def tourTimePredict_train(self):
        '''
        Train a ridge regression model which could predict the tour time given boarding coordinate, getting off
         coordinate and boarding time.
        '''
        # upCoord linearizes the 0.1-degree pickup grid cell: longitude-cell index * 12 + latitude-cell index
        onRecordDF = self.upRecord.filter(lambda p: (p[1][0][0] >= 120.5) & (p[1][0][0] <= 122.1) &
                                                    (p[1][0][1] >= 30.4) & (p[1][0][1] <= 31.5))\
                         .map(lambda p: Row(upCoord=int((round(p[1][0][0], 1) - 120.5) * 10 * 12 + (round(p[1][0][1], 1) - 30.4) * 10),
                                            upTime=p[1][1].hour, duration=p[1][2].days * 60 * 24 + p[1][2].seconds/60,
                                            manhLon=abs(p[1][3][0] - p[1][0][0]), manhLat=abs(p[1][3][1] - p[1][0][1])))
        onRecordDF = self.spark.createDataFrame(onRecordDF)

        # generate feature vector
        encoder_time = OneHotEncoder(inputCol='upTime',
                                     outputCol='upTime_onehot',
                                     dropLast=False)
        encoder_coord = OneHotEncoder(inputCol='upCoord',
                                      outputCol='upCoord_onehot',
                                      dropLast=False)
        assembler = VectorAssembler(inputCols=[
            'upTime_onehot', 'upCoord_onehot', 'manhLon', 'manhLat'
        ],
                                    outputCol='features')
        onRecordDF = assembler.transform(
            encoder_coord.transform(encoder_time.transform(onRecordDF)))
        trainSet, validSet, testSet = onRecordDF.randomSplit([7., 1., 2.])

        # train model
        lr = LinearRegression(labelCol='duration', regParam=0.01, maxIter=100)
        self.model = lr.fit(trainSet)
        train_summary = self.model.summary
        print('RMSE of training:', train_summary.rootMeanSquaredError, 'min')
        print('Adjusted R2 of training:', train_summary.r2adj)

        # evaluation
        evaluator = RegressionEvaluator(labelCol='duration')
        model_valid = self.model.transform(validSet)
        print('RMSE on validation set:',
              evaluator.evaluate(model_valid, {evaluator.metricName: 'rmse'}),
              'min')
        model_test = self.model.transform(testSet)
        print('RMSE on test set:',
              evaluator.evaluate(model_test, {evaluator.metricName: 'rmse'}),
              'min')
Example #11
def onehotencode(df, s1, s2, temp):
    from pyspark.ml.feature import OneHotEncoder, StringIndexer
    stringIndexer = StringIndexer(inputCol=s1, outputCol=temp)
    model = stringIndexer.fit(df)
    indexed = model.transform(df)
    encoder = OneHotEncoder(inputCol=temp, outputCol=s2)
    encoded = encoder.transform(indexed)
    encoded.select(s2).show()
    return encoded
def events(df,column_name):
    i = column_name+"I"
    v = column_name+"V"
    stringIndexer = StringIndexer(inputCol=column_name, outputCol=i)
    model = stringIndexer.fit(df)
    indexed = model.transform(df)
    encoder = OneHotEncoder(inputCol=i, outputCol=v)
    encoded = encoder.transform(indexed)
    return encoded
Example #13
def create_category_vars(dataset, field_name):
    idx_col = field_name + "Index"
    col_vec = field_name + "Vec"
    month_stringindexer = StringIndexer(inputCol=field_name, outputCol=idx_col)
    month_model = month_stringindexer.fit(dataset)
    month_indexed = month_model.transform(dataset)
    month_encoder = OneHotEncoder(dropLast=True, inputCol=idx_col, outputCol=col_vec)

    return month_encoder.transform(month_indexed)
def onehot_encoder_usecase():
    spark = getSparkSession()
    df = spark.createDataFrame([(0.0, 1.0), (1.0, 0.0), (2.0, 1.0), (0.0, 2.0),
                                (0.0, 1.0), (2.0, 0.0)],
                               ["categoryIndex1", "categoryIndex2"])

    encoder = OneHotEncoder(inputCol="categoryIndex2",
                            outputCol="categoryVec2")
    encoded = encoder.transform(df)
    encoded.show()
Example #15
 def _transform(self, df):
     ohe_columns = [col for col in df.columns if col.startswith('index_')]
     ohe_columns = [col for col in ohe_columns
                    if df.select(col).distinct().count() > 2]
     for column in ohe_columns:
         encoder = OneHotEncoder(inputCol=column, outputCol='ohe_' + column)
         df = encoder.transform(df)
         df = df.drop(column)
     return df
Example #16
def update_columns(column_list, main_df):
    for column_name in column_list:
        string_indexer = StringIndexer(inputCol=column_name,
                                       outputCol=f'{column_name}_Index')
        model = string_indexer.fit(main_df)
        indexed = model.transform(main_df)

        encoder = OneHotEncoder(inputCol=f'{column_name}_Index',
                                outputCol=f'{column_name}_Vec')
        main_df = encoder.transform(indexed)
    return main_df
def hot_encoding_var(df, feature_names):
    from pyspark.ml.feature import OneHotEncoder
    counter = 0
    for i in feature_names:
        counter+=1
        print("working on feature " + str(counter) + " of " + str(len(feature_names)))
        print("one hot encoding " + str(i) + " feature...")
        encoder = OneHotEncoder(inputCol=i, outputCol=str(i) + "_cd")
        df = encoder.transform(df)
    return df
Example #18
def ColToDummiesOHE(df, column):

    stringIndexer = StringIndexer(inputCol=column, outputCol=column + "Index")
    model = stringIndexer.fit(df)
    indexed = model.transform(df)

    encoder = OneHotEncoder(inputCol=column + "Index",
                            outputCol=column + "Vec")
    encoded = encoder.transform(indexed)

    return encoded
    def oneHotEncoding(self, df, input_col):
        stringInd = StringIndexer(inputCol=input_col, outputCol="indexed")
        model = stringInd.fit(df)
        td = model.transform(df)
        encoder = OneHotEncoder(inputCol="indexed", outputCol="features", dropLast=False)
        final_encoding = encoder.transform(td).select(df.id, 'features').cache()
        
        conv_udf = udf(lambda line: Vectors.dense(line).tolist())
        final_encoding = final_encoding.select(df.id,conv_udf(final_encoding.features).alias("num_"+input_col)).cache()

        return final_encoding
Example #20
def string_index_fun(col_name):
    stringindexer = StringIndexer(inputCol=col_name,
                                  outputCol=col_name + '_Index')
    model = stringindexer.fit(titanic_df)
    indexed = model.transform(titanic_df)
    # Convert the index column into a one-hot vector column named col_name + '_Vector'
    encoder = OneHotEncoder(inputCol=col_name + '_Index',
                            outputCol=col_name + '_Vector')
    encoded = encoder.transform(indexed)
    return encoded
Example #21
    def one_hot_encoding(self, df, target):
        df = label_encoding(self, df, target)
        ohe_columns = [col for col in df.columns if col.startswith('index_')]
        ohe_columns = [
            col for col in ohe_columns if df.select(col).distinct().count() > 2
        ]
        for column in ohe_columns:
            sti = OneHotEncoder(inputCol=column, outputCol='ohe_' + column)
            df = sti.transform(df)
            df = df.drop(column)

        print('One-Hot Encoding completed.')
        return df
Example #22
 def onehot_encoding_column(self, column_name):
     self._data_frame = self.label_encoding_column(column_name)
     encoder = OneHotEncoder(dropLast=False,
                             inputCol=column_name + "_ed_label_encoded",
                             outputCol=column_name + "_ed_one_hot_encoded")
     self._data_frame = encoder.transform(self._data_frame)
     self._data_frame = self._data_frame.withColumn(
         column_name + "_ed_one_hot_encoded",
         self._data_frame[column_name +
                          "_ed_one_hot_encoded"].cast('string'))
     self._data_frame = self._data_frame.drop(column_name +
                                              "_ed_label_encoded")
     return self._data_frame
Example #23
def findSimilar():

	#Dealing with the server request
	#project_ID = request.args.get('project_ID', None)
	project_ID = 'afd99a01739ad5557b51b1ba0174e832'
	projects.createOrReplaceTempView('projects')

	silhouette = []

	cols = ["Project_Subject_Category_Tree","Project_Subject_Subcategory_Tree","Project_Grade_Level_Category","Project_Resource_Category"]
	colsa = []

	#df = projects.select(cols)
	df = projects

	df = df.where(df.Project_Subject_Category_Tree.isNotNull())
	df = df.where(df.Project_Subject_Subcategory_Tree.isNotNull())
	df = df.where(df.Project_Grade_Level_Category.isNotNull())
	df = df.where(df.Project_Resource_Category.isNotNull())

	for i in range(len(cols)):
		stringIndexer = StringIndexer(inputCol=cols[i], outputCol=cols[i]+"a")
		model = stringIndexer.fit(df)
		df = model.transform(df)
		colsa.append(cols[i]+"a")

	for i in range(len(cols)):
		encoder = OneHotEncoder(inputCol=cols[i]+"a", outputCol=cols[i]+"v")
		df = encoder.transform(df)

	assembler = VectorAssembler(
		inputCols=colsa,
		outputCol="features")
	output = assembler.transform(df)

	kmax = 10  # the optimal K turns out to be k=4

	for i in range(2,kmax):
		# Trains a k-means model.
		kmeans = KMeans().setK(i).setSeed(1)
		model = kmeans.fit(output)
		# Evaluate clustering by computing Silhouette score
		predictions = model.transform(output)

		evaluator = ClusteringEvaluator()
		silhouette.append([i,evaluator.evaluate(predictions)])

	scores = np.array(silhouette)
	k_optimal = int(scores[np.argmax(scores[:, 1]), 0])
	kmeans = KMeans().setK(k_optimal).setSeed(1)
    def cat2Num(self, df, indices):
        """
        Convert the categorical fields at the given indices of rawFeatures into
        one-hot vectors, and reattach the remaining continuous fields.
        """

        # helper to select a single feature from a feature list
        def select_feature(raw_feature, index):
            return raw_feature[index]

        # helper to drop the features at the given indices from a feature list
        def delete_feature(raw_feature, indices):
            feature = [
                i for j, i in enumerate(raw_feature) if j not in indices
            ]
            return Vectors.dense(feature)

        # Get categorical features and perform One-Hot Encoding
        df_prev = df
        for index in indices:
            select_feature_udf = udf(lambda x: select_feature(x, index),
                                     StringType())
            df_encoded = df_prev.withColumn("cat_" + str(index),
                                            select_feature_udf("rawFeatures"))
            # string index
            stringIndexer = StringIndexer(inputCol="cat_" + str(index),
                                          outputCol="cat_index_" + str(index))
            model_stringIndexer = stringIndexer.fit(df_encoded)
            indexed = model_stringIndexer.transform(df_encoded)

            # one-hot encode
            encoder = OneHotEncoder(inputCol="cat_index_" + str(index),
                                    outputCol="cat_vector_" + str(index),
                                    dropLast=False)
            encoded = encoder.transform(indexed)
            df_prev = encoded

        # Get continuous features by removing categorical indices from rawFeatures
        delete_feature_udf = udf(lambda x: delete_feature(x, indices),
                                 VectorUDT())
        df_cont = df_prev.withColumn("cont", delete_feature_udf("rawFeatures"))

        # Combine the one-hot encoded categorical and continuous features
        feature = []
        for index in indices:
            feature.append("cat_vector_" + str(index))
        feature.append("cont")
        assembler = VectorAssembler(inputCols=feature, outputCol="features")
        df_transformed = assembler.transform(df_cont) \
            .select("id","rawFeatures","features")
        return df_transformed
Example #25
def K_means():
	knr = 2
	cols = ["Project_Subject_Category_Tree","Project_Subject_Subcategory_Tree","Project_Grade_Level_Category","Project_Resource_Category"]
	df = projects.select(cols)
	for i in range(len(cols)):
		stringIndexer = StringIndexer(inputCol=cols[i], outputCol=cols[i]+"a")
		model = stringIndexer.fit(df)
		df = model.transform(df)
	for i in range(len(cols)):
		encoder = OneHotEncoder(inputCol=cols[i]+"a", outputCol=cols[i]+"v")
		df = encoder.transform(df)
	# Assemble the encoded columns into the 'features' vector that KMeans expects
	assembler = VectorAssembler(inputCols=[c + "v" for c in cols], outputCol="features")
	df = assembler.transform(df)
	# Trains a k-means model.
	kmeans = KMeans().setK(knr).setSeed(1)
	model = kmeans.fit(df)
def indexAndEncode(processedData, features):
    encodedFinal = processedData
    for feature in features:

        stringIndexer = StringIndexer(inputCol=feature,
                                      outputCol=feature + "Index")
        model = stringIndexer.fit(
            encodedFinal)  # Input data-frame is the cleaned one from above
        indexed = model.transform(encodedFinal)
        encoder = OneHotEncoder(dropLast=False,
                                inputCol=feature + "Index",
                                outputCol=feature + "Vec")
        encodedFinal = encoder.transform(indexed)
    return encodedFinal
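A hypothetical follow-up that assembles the encoded vectors for a model (the feature names are placeholders, not from the snippet; indexAndEncode names each output column feature + "Vec"):

from pyspark.ml.feature import VectorAssembler

encodedDF = indexAndEncode(cleanedDF, ["vendor_id", "payment_type"])
assembler = VectorAssembler(inputCols=["vendor_idVec", "payment_typeVec"],
                            outputCol="features")
featuresDF = assembler.transform(encodedDF)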
Example #27
def get_sdummies(sdf, dummy_columns, keep_top, replace_with='other'):
    """Index string columns, grouping all values that fall outside the top
    keep_top cumulative share of rows (per column) into one replacement level.
    :param sdf: A pyspark.sql.dataframe.DataFrame
    :param dummy_columns: String columns that need to be indexed
    :param keep_top: List of cumulative-share cutoffs, one per column, e.g. [1, 0.8, 0.8]
    :param replace_with: String to use as replacement for the grouped values.
    """
    total = sdf.count()
    column_i = 0
    for string_col in dummy_columns:

        # Descending sorting with counts
        sdf_column_count = sdf.groupBy(string_col).count().orderBy(
            'count', ascending=False)
        sdf_column_count = sdf_column_count.withColumn(
            "cumsum",
            F.sum("count").over(Window.partitionBy().orderBy().rowsBetween(
                -sys.maxsize, 0)))

        # Obtain top dummy factors
        sdf_column_top_dummies = sdf_column_count.withColumn(
            "cumperc", sdf_column_count['cumsum'] /
            total).filter(col('cumperc') <= keep_top[column_i])
        keep_list = sdf_column_top_dummies.select(string_col).rdd.flatMap(
            lambda x: x).collect()
        sdf = sdf.withColumn(
            string_col,
            when((col(string_col).isin(keep_list)),
                 col(string_col)).otherwise(replace_with))

        # Apply string indexer
        pipeline = Pipeline(stages=[
            StringIndexer(inputCol=string_col, outputCol="IDX_" + string_col)
        ])
        sdf = pipeline.fit(sdf).transform(sdf)

        encoder = OneHotEncoder(inputCol="IDX_" + string_col,
                                outputCol="ONEHOT_" + string_col)
        encoder.setDropLast(
            True)  # only keep 2^n-n dummies to keep dummy independent.
        sdf = encoder.transform(sdf)

        column_i += 1

    ## Drop intermediate columns
    drop_columns = ["IDX_" + x for x in dummy_columns]  # +  dummy_columns
    sdf = sdf.drop(*drop_columns)

    return sdf
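A hypothetical call, keeping every level of the first column but only the levels covering the top 80% of rows for the other two (the column names are placeholders):

sdf = get_sdummies(sdf,
                   dummy_columns=['protocol', 'service', 'flag'],
                   keep_top=[1.0, 0.8, 0.8],
                   replace_with='other')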
Example #28
def one_hot_encode(column, dataframe):
  '''
  Returns a dataframe with an additional one hot encoded column specified on the input
  '''
  from pyspark.ml.feature import OneHotEncoder, StringIndexer
  
  # Indexing the column before one hot encoding
  stringIndexer = StringIndexer(inputCol=column, outputCol='categoryIndex')
  model = stringIndexer.fit(dataframe)
  indexed = model.transform(dataframe)
  
  # One hot encoding the column
  encoder = OneHotEncoder(inputCol='categoryIndex', outputCol=column+'_one_hot')
  encoded = encoder.transform(indexed).drop('categoryIndex')

  return encoded
Example #29
def category_to_vector(idx, df):
    cat_dist = []
    for i in idx:
        cat = df.schema.names[i]
        #StringToIndex
        categoryIndexer = StringIndexer(inputCol=cat, outputCol="Index_" + cat)
        categoryTransformer = categoryIndexer.fit(df)
        new_df = categoryTransformer.transform(df)
        #OneHotEncoder
        encoder = OneHotEncoder(dropLast=False,
                                inputCol="Index_" + cat,
                                outputCol="Vector_" + cat)
        new_df = encoder.transform(new_df)
        df = new_df
        cat_dist.append(categoryTransformer)
    return (new_df, cat_dist)
    def one_hot_encoding(self, df):
        '''
        Purpose: Encode data using one hot encoding
        Inputs : Data(spark dataframe)
        Output : Encoded data
        '''
        ohe_columns = [col for col in df.columns if col.startswith('index_')]
        ohe_columns = [
            col for col in ohe_columns if df.select(col).distinct().count() > 2
        ]
        for column in ohe_columns:
            sti = OneHotEncoder(inputCol=column, outputCol='ohe_' + column)
            df = sti.transform(df)
            df = df.drop(column)

        print('One-Hot Encoding completed.')

        return df
def oneHot():

    spark = SparkSession \
        .builder \
        .appName("OneHotEncoderExample") \
        .getOrCreate()

    df = spark.createDataFrame([(0, "a"), (1, "b"), (2, "c"), (3, "a"),
                                (4, "a"), (5, "b"), (6, "c")],
                               ["id", "category"])

    stringIndexer = StringIndexer(inputCol="category",
                                  outputCol="categoryIndex")
    model = stringIndexer.fit(df)
    indexed = model.transform(df)

    encoder = OneHotEncoder(inputCol="categoryIndex", outputCol="categoryVec")
    encoded = encoder.transform(indexed)
    encoded.show()
Example #32
def main(train_data_folder, model_path):

    #Starting session
    spark = SparkSession.builder.appName('BigDataML').getOrCreate()
    spark.sparkContext.setLogLevel("ERROR")

    #Loading data
    data = spark.read.parquet(train_data_folder)
    data = data.dropna(how='any')

    encoder = OneHotEncoder(inputCol="station_index",
                            outputCol="station_vector")
    data = encoder.transform(data)

    features =\
        VectorAssembler(inputCols=['hour', 'dayofyear', 'month', 'air_temp', 'wind_speed', 'visibility', 'weather_index', "station_vector"], outputCol="features")
    train_data = features.transform(data)

    # Train a GBT model.
    gbt = GBTRegressor(featuresCol="features", maxIter=10)

    # Chain indexer and GBT in a Pipeline
    pipeline = Pipeline(stages=[gbt])

    # Train model.  This also runs the indexer.
    model = pipeline.fit(train_data)
    model.write().overwrite().save(model_path)

    # Make predictions.
    predictions = model.transform(train_data)

    # Select example rows to display.
    predictions.select("prediction", "label", "features").show(5)

    # Select (prediction, true label) and compute test error
    evaluator = RegressionEvaluator(labelCol="label",
                                    predictionCol="prediction",
                                    metricName="r2")
    r2 = evaluator.evaluate(predictions)
    print("R2 on train data = %g" % r2)

    gbtModel = model.stages[0]
    print(gbtModel)  # summary only
def create_category_vars(dataset, field_name):
    # Create a new column with the suffix "Index" for each variable
    Index_col = field_name + "Index"
    # Create a new column with the suffix "Vec" for each variable
    Column_vec = field_name + "Vec"

    # For each variable return the index corresponding to the value in that variable
    # Define the StringIndex Object
    col_stringIndexer = StringIndexer(inputCol=field_name, outputCol=Index_col)
    # Fit the indexer to learn an index for each level of the variable
    model = col_stringIndexer.fit(dataset)
    # Return the index corresponding to each value into the column field_name + 'Index'
    idx_data = model.transform(dataset)

    # Using the Indexes returned from StringIndexer build and return the Vector of values for each variable
    encoder = OneHotEncoder(dropLast=True,
                            inputCol=Index_col,
                            outputCol=Column_vec)

    return encoder.transform(idx_data)
# MAGIC Here, we will use a combination of [StringIndexer](http://spark.apache.org/docs/latest/ml-features.html#stringindexer) and [OneHotEncoder](http://spark.apache.org/docs/latest/ml-features.html#onehotencoder) to convert the categorical variables. The OneHotEncoder will return a [SparseVector](https://spark.apache.org/docs/latest/api/python/pyspark.mllib.html#pyspark.mllib.linalg.SparseVector).

# COMMAND ----------

###One-Hot Encoding
from pyspark.ml.feature import OneHotEncoder, StringIndexer, VectorAssembler
  
categoricalColumns = ["workclass", "education", "marital_status", "occupation", "relationship", "race", "sex", "native_country"]
for categoricalCol in categoricalColumns:
  # Category Indexing with StringIndexer
  stringIndexer = StringIndexer(inputCol=categoricalCol, outputCol=categoricalCol+"Index")
  model = stringIndexer.fit(dataset)
  indexed = model.transform(dataset)
  # Use OneHotEncoder to convert categorical variables into binary SparseVectors
  encoder = OneHotEncoder(inputCol=categoricalCol+"Index", outputCol=categoricalCol+"classVec")
  encoded = encoder.transform(indexed)
  dataset = encoded

print(dataset.take(1))

# COMMAND ----------

# MAGIC %md
# MAGIC The above code indexes each categorical column using the StringIndexer, then converts the indexed categories into one-hot encoded variables. The resulting binary vectors are appended to the end of each row.

# COMMAND ----------

# MAGIC %md
# MAGIC We use the StringIndexer() again here to encode our labels to label indices

# COMMAND ----------
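# MAGIC %md
# MAGIC A minimal sketch of that label-indexing step; the label column name ("income") is an assumption, not taken from the notebook:

# COMMAND ----------

label_stringIdx = StringIndexer(inputCol="income", outputCol="label")  # "income" is assumed
dataset = label_stringIdx.fit(dataset).transform(dataset)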
Example #35
  def initialize(self, do_scaling=True, do_onehot=True):
    """Reads the dataset, initializes class members.

    features_df: Original DataFrame as read from the features_file.
    train_df: A DataFrame with columns Lat, Lon, Pickup_Count and
        vector columns Features & ScaledFeatures. Contains only data before 2015.
    test_df: As train_df, but only containing data of 2015.
    districts_with_counts: A DataFrame with all districts and their counts.
    """

    # Read feature dataframe
    self.features_df = self.sql_context.read.parquet(self.features_file).cache()

    # Set exclude columns to default
    exclude_columns = self.EXCLUDE_COLUMNS

    # Scale features
    if do_scaling:
      assembler = VectorAssembler(inputCols=self.SCALE_COLUMNS,
                                  outputCol='FeaturesToScale')
      self.features_df = assembler.transform(self.features_df)
      scaler = StandardScaler(inputCol='FeaturesToScale',
                              outputCol=('ScaledFeatures'),
                              withStd=True, withMean=False)
      self.features_df = scaler.fit(self.features_df).transform(self.features_df)

      exclude_columns += self.SCALE_COLUMNS + ['FeaturesToScale']

    # Shift 1-based categorical features into the [0, numCategories) value range the encoder expects
    for column in ['Day', 'Month', 'Day_Of_Year']:
        if column in self.features_df.columns:
            self.features_df = self.features_df.withColumn(column, self.features_df[column] - 1)

    # Encode categorical features using one-hot encoding
    if do_onehot:
      vec_category_columns = ['%s_Vector' % column for column in self.ONE_HOT_COLUMNS]
      for i in range(len(self.ONE_HOT_COLUMNS)):
        column = self.ONE_HOT_COLUMNS[i]
        if column in self.features_df.columns:
            self.features_df = self.features_df.withColumn(column, self.features_df[column].cast(DoubleType()))
            encoder = OneHotEncoder(inputCol=column,
                                    outputCol=vec_category_columns[i],
                                    dropLast=False)
            self.features_df = encoder.transform(self.features_df)
      exclude_columns += self.ONE_HOT_COLUMNS

    # Vectorize features
    feature_columns = [column for column in self.features_df.columns
                              if column not in exclude_columns]
    assembler = VectorAssembler(inputCols=feature_columns, outputCol='Features')
    self.features_df = assembler.transform(self.features_df)

    # Set number of distinct values for categorical features (identified by index)
    self.categorical_features_info = {}
    if not do_onehot:
        self.categorical_features_info = {i:self.CATEGORY_VALUES_COUNT[feature_columns[i]]
                                          for i in range(len(feature_columns))
                                          if feature_columns[i] in self.CATEGORY_VALUES_COUNT.keys()}

    # Split into train and test data
    split_date = datetime(2015, 1, 1)
    self.train_df = self.features_df.filter(self.features_df.Time < split_date).cache()
    self.test_df = self.features_df.filter(self.features_df.Time >= split_date).cache()

    # Compute Districts with counts
    self.districts_with_counts = self.features_df \
                                 .groupBy([self.features_df.Lat, self.features_df.Lon]) \
                                 .count()
# NOTE: the opening rows of idxIn were truncated in this excerpt; the values below are assumed for illustration
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorIndexer
idxIn = spark.createDataFrame([
  (Vectors.dense(1, 2, 3), 1),
  (Vectors.dense(2, 5, 6), 2),
  (Vectors.dense(1, 8, 9), 3)
]).toDF("features", "label")
indxr = VectorIndexer()\
  .setInputCol("features")\
  .setOutputCol("idxed")\
  .setMaxCategories(2)
indxr.fit(idxIn).transform(idxIn).show()


# COMMAND ----------

from pyspark.ml.feature import OneHotEncoder, StringIndexer
lblIndxr = StringIndexer().setInputCol("color").setOutputCol("colorInd")
colorLab = lblIndxr.fit(simpleDF).transform(simpleDF.select("color"))
ohe = OneHotEncoder().setInputCol("colorInd")
ohe.transform(colorLab).show()


# COMMAND ----------

from pyspark.ml.feature import Tokenizer
tkn = Tokenizer().setInputCol("Description").setOutputCol("DescOut")
tokenized = tkn.transform(sales.select("Description"))
tokenized.show(20, False)


# COMMAND ----------

from pyspark.ml.feature import RegexTokenizer
rt = RegexTokenizer()\
  .setInputCol("Description")\
  .setOutputCol("DescOut")\
  .setPattern(" ")  # NOTE: the rest of this snippet was truncated; the output column and pattern are assumed
     WHEN (pickup_hour >= 16 AND pickup_hour <= 19) THEN "PMRush"
    END as TrafficTimeBins
    FROM taxi_test 
"""
taxi_df_test_with_newFeatures = sqlContext.sql(sqlStatement)

## CACHE DATA-FRAME IN MEMORY & MATERIALIZE DF IN MEMORY
taxi_df_test_with_newFeatures.cache()
taxi_df_test_with_newFeatures.count()

## INDEX AND ONE-HOT ENCODING
stringIndexer = StringIndexer(inputCol="vendor_id", outputCol="vendorIndex")
model = stringIndexer.fit(taxi_df_test_with_newFeatures) # Input data-frame is the cleaned one from above
indexed = model.transform(taxi_df_test_with_newFeatures)
encoder = OneHotEncoder(dropLast=False, inputCol="vendorIndex", outputCol="vendorVec")
encoded1 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="rate_code", outputCol="rateIndex")
model = stringIndexer.fit(encoded1)
indexed = model.transform(encoded1)
encoder = OneHotEncoder(dropLast=False, inputCol="rateIndex", outputCol="rateVec")
encoded2 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="payment_type", outputCol="paymentIndex")
model = stringIndexer.fit(encoded2)
indexed = model.transform(encoded2)
encoder = OneHotEncoder(dropLast=False, inputCol="paymentIndex", outputCol="paymentVec")
encoded3 = encoder.transform(indexed)

stringIndexer = StringIndexer(inputCol="TrafficTimeBins", outputCol="TrafficTimeBinsIndex")
model = stringIndexer.fit(encoded3)
df_model = df_ORG
# stringIndexer1 = StringIndexer(inputCol="Origin", outputCol="originIndex")
# model_stringIndexer = stringIndexer1.fit(df_model)
# indexedOrigin = model_stringIndexer.transform(df_model)
# encoder1 = OneHotEncoder(dropLast=False, inputCol="originIndex", outputCol="originVec")
# df_model = encoder1.transform(indexedOrigin)


# In[ ]:

stringIndexer2 = StringIndexer(inputCol="Dest", outputCol="destIndex")
model_stringIndexer = stringIndexer2.fit(df_model)
indexedDest = model_stringIndexer.transform(df_model)
encoder2 = OneHotEncoder(dropLast=False, inputCol="destIndex", outputCol="destVec")
df_model = encoder2.transform(indexedDest)


# We use a __labeled point__ to associate a local vector with a label/response. In MLlib, labeled points are used in supervised learning algorithms, and the labels are stored as doubles. For binary classification, a label should be either 0 (negative) or 1 (positive). 

# In[105]:

assembler = VectorAssembler(
    inputCols = ['Year','Month','DayofMonth','DayOfWeek','Hour','Distance','destVec'],
    outputCol = "features")
output = assembler.transform(df_model)
airlineRDD = output.rdd.map(lambda row: LabeledPoint([0, 1][row['DepDelayed']], row['features']))


# ### Preprocessing: Splitting the dataset into train and test datasets
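The split itself is missing from this excerpt; a minimal sketch using RDD.randomSplit (the 70/30 ratio and the seed are assumptions):

trainRDD, testRDD = airlineRDD.randomSplit([0.7, 0.3], seed=42)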
		 False if r.attributes['Ambience'] is None else r.attributes['Ambience']['romantic'],
		 False if r.attributes['Ambience'] is None else r.attributes['Ambience']['upscale'],
		 False if r.attributes['Ambience'] is None else r.attributes['Ambience']['casual'],
		 False if (r.attributes['Alcohol'] is None or r.attributes['Alcohol'] == 'none') else True,
		 False if r.attributes['Take-out'] is None else r.attributes['Take-out']]
	).toDF(clustering_columns)

# drop row with null values
lv_clustering_data = lv_clustering_data.dropna()

#Neighborhood feature engineering
stringIndexer = StringIndexer(inputCol="neighborhood", outputCol="neigh_index")
lv_model = stringIndexer.fit(lv_clustering_data)
lv_indexed = lv_model.transform(lv_clustering_data)
encoder = OneHotEncoder(dropLast=False, inputCol="neigh_index", outputCol="neigh_vec")
lv_encoded = encoder.transform(lv_indexed)

#initial feature set
# assembler = VectorAssembler(
#     inputCols=["stars", "price_range", "neigh_vec"],
#     outputCol="features_vec")

#expanded feature set
feature_columns = clustering_columns[2:]
feature_columns.append("neigh_vec")
assembler = VectorAssembler(
    inputCols=feature_columns,
    outputCol="features_vec")

lv_assembled = assembler.transform(lv_encoded)