Python StringIndexer.transform Examples, pyspark.ml.feature.StringIndexer.transform Python Examples

Example #1

0

Show file

File: dist_predict_libsvm_all_1.py Project: happy-lu/spark

def read_csv(spark, file_name):
    """Load a CSV file, index three string columns and assemble features.

    The ``date``, ``serial_number`` and ``model`` columns are each indexed by
    a ``StringIndexer`` fitted on the raw frame. Nulls are then filled with
    ``"0"``, every column from position 5 onwards is cast to double, and
    those columns are packed into an ``indexedFeatures`` vector column.

    Args:
        spark: Object passed to ``SQLContext`` to obtain a reader.
        file_name: Path of the CSV file to load.

    Returns:
        The transformed DataFrame, including the ``indexedFeatures`` column.
    """
    reader = SQLContext(spark).read.format('com.databricks.spark.csv')
    raw = reader.options(header='true', format="string").load(file_name)

    # Every indexer is fitted on the untouched input before any transform runs.
    index_specs = (
        ("date", "date_index"),
        ("serial_number", "serial_number_index"),
        ("model", "model_index"),
    )
    fitted_indexers = [
        StringIndexer(inputCol=source, outputCol=target).fit(raw)
        for source, target in index_specs
    ]

    indexed = raw
    for indexer_model in fitted_indexers:
        indexed = indexer_model.transform(indexed)

    indexed = indexed.na.fill("0")
    feature_names = indexed.columns[5:]
    for column_name in feature_names:
        indexed = indexed.withColumn(
            column_name, indexed[column_name].cast("double"))

    assembler = VectorAssembler(inputCols=feature_names,
                                outputCol="indexedFeatures")
    return assembler.transform(indexed)

Example #2

0

Show file

    def __clean_data(self, df, is_fraud="isfraud"):
        """Reduce ``df`` to a two-column ``label`` / ``features`` frame.

        Args:
            df: Input DataFrame; must contain a ``type`` column and the
                column named by ``is_fraud``.
            is_fraud: Name of the column that is indexed into ``label``.

        Returns:
            A DataFrame with only the ``label`` and ``features`` columns.
        """
        excluded = [is_fraud, 'label']

        # Drop the columns that are not used
        df = df.drop('paysim_id', 'nameorig', 'namedest')

        # Index the "type" strings, then discard the original column
        type_indexer = StringIndexer(inputCol="type",
                                     outputCol="type_numeric")
        df = type_indexer.fit(df).transform(df)
        df = df.drop(df.type)

        # One-hot encode the index, then discard the intermediate index
        type_encoder = OneHotEncoder(inputCol="type_numeric",
                                     outputCol="type_vector")
        df = type_encoder.transform(df).drop("type_numeric")

        # Index the fraud flag into "label" and discard the source column
        label_indexer = StringIndexer(inputCol=is_fraud, outputCol='label')
        df = label_indexer.fit(df).transform(df)
        df = df.drop(is_fraud)

        # Every remaining column except the label goes into the feature vector
        feature_inputs = [name for name in df.columns if name not in excluded]
        assembler = VectorAssembler(inputCols=feature_inputs,
                                    outputCol='features')
        df = assembler.transform(df)

        # Keep only the columns in the final format
        return df.select(['label', 'features'])

Example #3

0

Show file

def load_csv():
    """Read the CSV at ``csv_path`` and build user, item and transaction frames.

    Users (``u``) are "last name + space + first initial"; items (``m``) are
    vendors. Both are string-indexed and the indices are shifted by one and
    cast to float.

    Returns:
        A tuple ``(uDF, mDF, tDF)``: distinct ``(uid, u)`` pairs ordered by
        ``uid``; distinct ``(mid, m)`` pairs ordered by ``mid``; and the rows
        with ``uid <= u_limit`` and ``mid <= m_limit`` plus a ``month``
        column, without the ``u`` and ``m`` columns.
    """
    reader = spark.read.format("csv")
    reader = reader.option("header", "true").option("mode", "DROPMALFORMED")
    raw_df = reader.load(csv_path)

    needed = raw_df.select("Cardholder Last Name",
                           "Cardholder First Initial",
                           "Amount",
                           "Vendor",
                           "Year-Month")
    user_name = concat(col("Cardholder Last Name"), lit(" "),
                       col("Cardholder First Initial"))
    data_df = needed.select(
        user_name.alias("u"),
        concat(col("Vendor")).alias("m"),
        col("Year-Month").alias("date"),
        col("Amount"),
    )

    user_model = StringIndexer(inputCol="u", outputCol="uid").fit(data_df)
    item_model = StringIndexer(inputCol="m", outputCol="mid").fit(data_df)

    indexed = item_model.transform(user_model.transform(data_df))
    # Shift both indices by one and store them as floats.
    indexed = indexed.withColumn("uid", (col("uid") + 1).cast(FloatType()))
    indexed = indexed.withColumn("mid", (col("mid") + 1).cast(FloatType()))
    data_df = indexed.cache()

    to_month = udf(lambda s: _date_to_month(s))
    users = data_df.select("uid", "u").distinct().orderBy("uid")
    items = data_df.select("mid", "m").distinct().orderBy("mid")

    limited = data_df.filter(data_df["uid"] <= u_limit)
    limited = limited.filter(data_df["mid"] <= m_limit)
    transactions = limited.withColumn("month", to_month(col("date")))
    transactions = transactions.drop("u", "m")
    return users, items, transactions

Example #4

0

Show file

File: naive_bayes.py Project: zachdj/elizabeth

def main(train_x,
         train_y,
         test_x,
         test_y=None,
         idf=False,
         ngram=1,
         base='gs',
         asm=False):
    """Train a Naive Bayes classifier and score or print test predictions.

    Args:
        train_x: Training data source handed to ``elizabeth.load``.
        train_y: Training labels source handed to ``elizabeth.load``.
        test_x: Test data source handed to ``elizabeth.load``.
        test_y: Optional test labels source. When given, the mean accuracy is
            shown; when ``None``, the predicted classes are printed one per
            line.
        idf: When true, an ``IDF`` stage is added to the preprocessor.
        ngram: N-gram size; converted with ``int`` before use.
        base: Passed through to ``elizabeth.load``.
        asm: Selects the ``'asm'`` kind when true, ``'bytes'`` otherwise.
    """
    # Load : DF[id, url, features, label?]
    # The DataFrames only have a labels column if labels are given.
    # We drop the text, since Naive Bayes doesn't use it and we already have all the tokens
    kind = 'asm' if asm else 'bytes'
    train = elizabeth.load(train_x, train_y, base=base, kind=kind).drop('text')
    test = elizabeth.load(test_x, test_y, base=base, kind=kind).drop('text')

    # A single flag keeps the "index the test labels" and "score the test set"
    # decisions in agreement.
    has_test_labels = test_y is not None

    # convert the string labels to numeric indices
    # the handleInvalid param allows the label indexer to deal with labels that weren't seen during fitting
    label_indexer = StringIndexer(inputCol='label',
                                  outputCol='indexedLabel',
                                  handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)
    # the test set won't always have labels
    if has_test_labels:
        test = label_indexer.transform(test)

    index_labeller = IndexToString(inputCol='prediction',
                                   outputCol='predictedClass',
                                   labels=label_indexer.labels)

    # Train the preprocessor and transform the data.
    prep = elizabeth.Preprocessor()
    prep.add(NGram(n=int(ngram)))
    prep.add(CountVectorizer())
    if idf:
        prep.add(IDF())
    train = prep.fit(train)
    test = prep.transform(test)

    # Naive Bayes : DF[id, url, text, features, label?, rawPrediction, probability, prediction]
    nb = NaiveBayes(labelCol='indexedLabel').fit(train)
    test = nb.transform(test)
    test = index_labeller.transform(
        test)  # DF[id, url, ... prediction, predictedClass]

    # If labels are given for the test set, print a score.
    if has_test_labels:
        test = test.orderBy(test.id)
        test = test.withColumn(
            'correct', (test.label == test.predictedClass).cast('double'))
        test = test.select(avg(test.correct))
        # show() prints the table itself and returns None, so it must not be
        # wrapped in print().
        test.show()

    # If no labels are given for the test set, print predictions.
    else:
        test = test.orderBy(test.id).select(test.predictedClass)
        test = test.rdd.map(lambda row: int(row.predictedClass))
        test = test.toLocalIterator()
        print(*test, sep='\n')

Example #5

0

Show file

def main(spark):
    '''
    Score precomputed recommendations against the test set and print the
    mean average precision.

    Parameters
    ----------
    spark : SparkSession object
    '''
    test_path = 'hdfs:/user/bm106/pub/project/cf_test.parquet'
    test = spark.read.parquet(test_path)
    test.createOrReplaceTempView('test')

    # Divide each count by the number of non-null counts of the same user.
    per_user = Window.partitionBy("user_id")
    scaled_count = col("count") / count("count").over(per_user)
    test = test.select("user_id", "track_id", scaled_count.alias("count"))
    test.createOrReplaceTempView('test')
    print("Ratio scores done")

    train_sample = spark.read.parquet('hdfs:/user/dev241/extension4_ratio.parquet')
    train_sample.createOrReplaceTempView('train_sample')
    print("Training sample ext4 loaded")

    indexer_pipeline = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')
    test_idx = indexer_pipeline.transform(test)
    train_idx = indexer_pipeline.transform(train_sample)

    # change to best
    rank, alpha, reg = 78, 14.287069059772636, 0.41772043857578584

    model = ALSModel.load("Extension4_ratio")
    print('Model loaded')

    # test ranking metrics
    test_idx = test_idx.select('user_idx', 'track_idx', 'count')
    test_users = test_idx.select('user_idx').distinct()
    true_tracks = F.collect_set('track_idx').alias('test_labels')
    test_comb = test_idx.groupBy('user_idx').agg(true_tracks)
    track_number = 500
    rec_test = spark.read.parquet('hdfs:/user/dev241/rec_test4.parquet')
    print('Rec test loaded.')
    join = test_comb.join(rec_test, test_comb.user_idx == rec_test.user_idx)
    print('Join done.')

    # Persist the joined frame under unambiguous column names.
    j4 = join.toDF('user_idx', 'test_labels', 'user_idx2', 'recommendations')
    j4.write.parquet("ext4join")
    print('j4 parquet written')

    def to_prediction_and_labels(r):
        return ([track.track_idx for track in r.recommendations], r.test_labels)

    predictionAndLabels = join.rdd.map(to_prediction_and_labels)
    print('Map done.')
    metrics = RankingMetrics(predictionAndLabels)
    print('RM done.')
    mavgp = metrics.meanAveragePrecision
    print("Test mean Average Precision : ", mavgp)

Example #6

0

Show file

def main(spark):
    '''
    Fit an implicit-feedback ALS model on the training sample, save it, and
    print the mean average precision of its top-500 recommendations on the
    test set.

    Parameters
    ----------
    spark : SparkSession object
    '''

    # Input locations
    test_path = 'hdfs:/user/bm106/pub/project/cf_test.parquet'
    train_sample_path = 'hdfs:/user/ah3243/extension1_count_greater_1.parquet'

    # Load both parquet inputs
    test = spark.read.parquet(test_path)
    train_sample = spark.read.parquet(train_sample_path)

    # Apply the saved indexing pipeline to both frames
    print("String Indexer entered")
    indexer_pipeline = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')
    test_idx = indexer_pipeline.transform(test)
    train_idx = indexer_pipeline.transform(train_sample)
    print("String Indexer done")

    # change to best
    als_params = dict(rank=78,
                      alpha=14.287069059772636,
                      regParam=0.41772043857578584,
                      userCol="user_idx",
                      itemCol="track_idx",
                      ratingCol="count",
                      coldStartStrategy="drop",
                      implicitPrefs=True)

    # model
    model = ALS(**als_params).fit(train_idx)
    print("Model fit for test done")
    model.save("Test_Model")
    print("Model save for test done")

    # test ranking metrics
    test_idx = test_idx.select('user_idx', 'track_idx', 'count')
    test_users = test_idx.select('user_idx').distinct()
    true_tracks = F.collect_set('track_idx').alias('test_labels')
    test_comb = test_idx.groupBy('user_idx').agg(true_tracks)
    track_number = 500
    rec_test = model.recommendForUserSubset(test_users, track_number)
    join = test_comb.join(rec_test, test_comb.user_idx == rec_test.user_idx)

    def to_prediction_and_labels(r):
        return ([track.track_idx for track in r.recommendations], r.test_labels)

    predictionAndLabels = join.rdd.map(to_prediction_and_labels)
    metrics = RankingMetrics(predictionAndLabels)
    mavgp = metrics.meanAveragePrecision
    print("Test mean Average Precision : ", mavgp)

Example #7

0

Show file

def indexData(df_sample):
    """String-index every column of ``df_sample`` except ``target``.

    Each indexed column ``c`` gains a ``c_index`` companion. The original
    column is dropped afterwards, except for ``id``, which is kept next to
    ``id_index``. Names of the replaced columns are printed as they are
    processed.

    Args:
        df_sample: DataFrame whose schema fields are walked.

    Returns:
        The DataFrame with the index columns applied.
    """
    for field in df_sample.schema.fields:
        column = field.name
        if column == "target":
            continue

        keep_original = column == "id"
        if not keep_original:
            print(column)

        fitted = StringIndexer(inputCol=column,
                               outputCol=column + "_index").fit(df_sample)
        df_sample = fitted.transform(df_sample)
        if not keep_original:
            df_sample = df_sample.drop(column)
    return df_sample

Example #8

0

Show file

    def train_test(self, df):
        """Train a gradient-boosted-tree pipeline on ``df`` and report on it.

        The frame is filtered with ``dropNonTCPUDP``, turned into a
        label/features frame with ``get_dummy``, split 70/30, and fed through
        a pipeline of label indexer, feature indexer, GBT classifier and
        index-to-label converter. The test error and metrics are printed.

        Args:
            df: Input DataFrame.

        Returns:
            The fitted pipeline model.
        """
        df = self.dropNonTCPUDP(df)

        catCols = []
        numCols = ['avg_ipt', 'bytes_in', 'bytes_out', 'entropy', 'total_entropy', 'num_pkts_out', 'num_pkts_in', 'duration']
        labelCol = 'label'

        data = self.get_dummy(df, catCols, numCols, labelCol)
        data.show()

        # Both indexers are only fitted here. They are applied as pipeline
        # stages below, so transforming `data` at this point and discarding
        # the result would do nothing.
        labelIndexer = StringIndexer(inputCol='label',
                                     outputCol='indexedLabel').fit(data)
        featureIndexer = VectorIndexer(inputCol="features",
                                       outputCol="indexedFeatures").fit(data)

        (trainingData, testData) = data.randomSplit([0.7, 0.3])
        trainingData.cache()
        testData.cache()
        trainingData.show(5, False)
        testData.show(5, False)

        gbt = GBTClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')

        # Convert indexed labels back to original labels.
        labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                                       labels=labelIndexer.labels)

        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt, labelConverter])
        model = pipeline.fit(trainingData)
        predictions = model.transform(testData)
        # Display example rows; select() alone is lazy and shows nothing.
        predictions.select("features", "label", "predictedLabel", "prediction").show(5)

        # Compute and print the test error and the remaining metrics.
        print(self.getTestError(predictions))
        self.printMetrics(predictions)

        return model

Example #9

0

Show file

def get_sample_data():
    '''
    Load the iris dataset and return it as a Spark DataFrame for example
    purposes.

    Arguments: None

    Returns:
      data {PySpark Dataframe} -- the iris dataset as a ``features`` vector
      column and an indexed ``label`` column
    '''
    iris = datasets.load_iris()
    column_names = iris['feature_names'] + ['target']
    values = np.c_[iris['data'], iris['target']]
    frame = spark.createDataFrame(pd.DataFrame(data=values,
                                               columns=column_names))

    # Pack every column except the trailing target into one vector column.
    assembler = VectorAssembler(inputCols=frame.columns[:-1],
                                outputCol='features')
    vectorized = assembler.transform(frame).select(['features', 'target'])

    # Index the target values into a label column.
    indexer_model = StringIndexer(inputCol='target',
                                  outputCol='label').fit(vectorized)
    labelled = indexer_model.transform(vectorized)

    # Return only the features and the label.
    return labelled.select(['features', 'label'])

Example #10

0

Show file

File: PredictiveDataTransformation.py Project: sahilsingh1123/predictiveAnalysis

    def isLabelIndexed(self, schemaData, label, dataset):
        """String-index the label column of ``dataset`` when it is a string.

        Args:
            schemaData: Iterable of schema fields exposing ``name`` and
                ``dataType``.
            label: Name of the label column to look for.
            dataset: DataFrame that holds the label column.

        Returns:
            A dict with three entries keyed by ``PredictiveConstants``:
            ``DATASET`` (the possibly transformed DataFrame),
            ``ISLABELINDEXED`` (``"yes"`` or ``"no"``) and ``LABELINDEXER``
            (the fitted indexer, or ``None`` when no indexing took place).
        """
        isLabelIndexed = "no"
        labelIndexer = None
        for schemaVal in schemaData:
            if schemaVal.name != label:
                continue
            # typeName() gives "string" for StringType. Comparing str() of
            # the type breaks on PySpark versions that render it as
            # "StringType()".
            if schemaVal.dataType.typeName() == "string":
                labelIndexer = StringIndexer(
                    inputCol=label,
                    outputCol=PredictiveConstants.INDEXED_ + label,
                    handleInvalid="keep").fit(dataset)
                dataset = labelIndexer.transform(dataset)
                isLabelIndexed = "yes"
            # The label field has been handled. Stop so a later field cannot
            # overwrite the outcome.
            break

        return {
            PredictiveConstants.DATASET: dataset,
            PredictiveConstants.ISLABELINDEXED: isLabelIndexed,
            PredictiveConstants.LABELINDEXER: labelIndexer
        }

Example #11

0

Show file

File: diff_feature_cls.py Project: hongbin0908/bintrade

def run(start1, end1, start2, end2, df, sc, sql_context, is_pred):
    """Fit a label/feature-indexing pipeline and evaluate or store predictions.

    Labeled points are built for ``start1``..``end2``. The pipeline is fitted
    on labeled rows with ``date3 < end1`` and applied to rows with
    ``date2 > start2``. The predictions are passed through ``val``.

    Args:
        start1, end1: Bounds used for the training rows.
        start2, end2: Bounds used for the checked rows.
        df, sc, sql_context: Passed through to ``get_labeled_points``.
        is_pred: When true, the unlabeled predictions for the current date
            are stored in the ``predictions`` table and the first ten are
            printed.
    """
    lp_data = get_labeled_points(start1, end2, df, sc, sql_context)
    # Single-argument print calls behave the same on Python 2 and Python 3.
    print(lp_data.count())

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(lp_data)
    td = labelIndexer.transform(lp_data)

    # Map each original label to its assigned index, visiting labels in
    # ascending order.
    label_rows = td.select(td.label, td.indexedLabel).distinct().collect()
    label_pairs = sorted(set([(row[0], row[1]) for row in label_rows]),
                         key=lambda pair: pair[0])
    label2index = {}
    for original, indexed in label_pairs:
        label2index[int(original)] = int(indexed)
    print(label2index)

    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(lp_data)

    rf = get_model()

    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

    lp_train = lp_data.filter(lp_data.date3 < end1).filter(lp_data.is_labeled == 1)
    model = pipeline.fit(lp_train)
    lp_check = lp_data.filter(lp_data.date2 > start2)
    predictions = model.transform(lp_check)
    predictions = val(predictions, label2index, sql_context)

    if is_pred:
        predictions = predictions.filter(predictions.is_labeled == 0).filter(predictions.date2 == get_cur()).sort(predictions.prob.desc())
        dfToTableWithPar(sql_context, predictions, "predictions", get_cur())
        for each in predictions.take(10):
            print(each)

Example #12

0

Show file

File: predictor.py Project: shuson/EB5001_BDEA_CA

def predict(doc):
    """Run the module-level ``model`` on one tweet document.

    Args:
        doc: Mapping with the keys ``source``, ``retweet_count``,
            ``favorite_count``, ``is_retweet``, ``text`` and ``created_at``.

    Returns:
        The ``prediction`` value of the first result row, or ``None`` when
        the model returns no rows.
    """
    fields = dict(source=doc['source'],
                  retweet_count=doc['retweet_count'],
                  favorite_count=doc['favorite_count'],
                  is_retweet=doc['is_retweet'])

    # One sentiment score per component of the analyzer output.
    for component in ("compound", "neg", "neu", "pos"):
        fields["sentiment_" + component] = sentiment_analyze(doc['text'],
                                                             component)

    # Calendar features derived from the creation timestamp.
    created_at = doc['created_at']
    fields["hour"] = util.convertUTCtoHourOfDay(created_at)
    fields["day"] = util.convertUTCtoDay(created_at)
    fields["week"] = util.convertUTCtoWeekNumber(created_at)
    fields["month"] = util.convertUTCtoMonth(created_at)
    fields["year"] = util.convertUTCtoYear(created_at)

    tweet_df = spark.createDataFrame([Row(**fields)])

    # NOTE(review): the indexer is fitted on this single row only - confirm
    # that this matches how "source" was indexed when the model was trained.
    indexer = StringIndexer().setInputCol("source").setOutputCol("source_index")
    indexed_df = indexer.fit(tweet_df).transform(tweet_df)
    numeric_df = indexed_df.select(
        [col(c).cast("double").alias(c) for c in indexed_df.columns])

    rows = model.transform(numeric_df).select("prediction").collect()
    return rows[0]['prediction'] if len(rows) > 0 else None

Example #13

0

Show file

 def parse_data(self, path_ratings, nrows):
     """Read the ratings CSV and store (UID, IID, stars) rows in ``self.data``.

     Args:
         path_ratings: Path of the ratings CSV file (read with a header row).
         nrows: Maximum number of rows to keep.
     """
     ratings = self.sqlContext.read.csv(
         path_ratings, header=True, quote='"').limit(nrows)

     user_indexer = StringIndexer(inputCol="user_id", outputCol="UID")
     self.data = user_indexer.fit(ratings).transform(ratings)

     item_indexer = StringIndexer(inputCol="business_id", outputCol="IID")
     self.data = item_indexer.fit(ratings).transform(self.data)

     # uid and iid must be integers for spark ALS
     def to_rating(r):
         return (int(r['UID']), int(r['IID']), float(r['stars']))

     self.data = self.data.rdd.map(to_rating).toDF(("UID", "IID", "stars"))

Example #14

0

Show file

File: load_data.py Project: frank-cao-csun/Restaurant-Recommendation-WebApp

def review_ids_to_number(dataframe):
    """Index ``user_id`` and ``business_id`` and persist models and result.

    Both fitted indexer models are saved under ``model/`` and the indexed
    DataFrame is written to ``dataset/review_vegas_als.parquet``. The first
    20 indexed rows are shown, and progress is logged.

    Args:
        dataframe: DataFrame with ``user_id`` and ``business_id`` columns.
    """
    # Fit and save the indexer model for user_id
    user_model_path = os.path.join('model', 'user_ind_model')
    user_model = StringIndexer(inputCol="user_id",
                               outputCol="user_id_num").fit(dataframe)
    user_model.write().overwrite().save(user_model_path)

    # Fit and save the indexer model for business_id
    business_model_path = os.path.join('model', 'bus_ind_model')
    business_model = StringIndexer(inputCol="business_id",
                                   outputCol="business_id_num",
                                   handleInvalid="skip").fit(dataframe)
    business_model.write().overwrite().save(business_model_path)

    # Add the numeric id columns
    final_indexed = business_model.transform(user_model.transform(dataframe))
    final_indexed.show(20)

    # Save the indexed dataframe
    output_path = os.path.join('dataset', 'review_vegas_als.parquet')
    final_indexed.write.mode('overwrite').parquet(output_path)
    logger.error('Indexed dataframe for ALS traing saved to review_vegas_als.parquet')
    logger.error('{} seconds has elapsed'.format(time.time() - start_time))

Example #15

0

Show file

def ProcessData(df):
    """Add a label, indexed categoricals, integer casts and a feature vector.

    ``Cancelled`` is cast to an integer ``label``; ``Origin`` and ``Dest`` are
    string-indexed; the numeric source columns are cast to integer
    ``*Int`` columns; and the listed columns are assembled into ``features``.

    Args:
        df: Input DataFrame.

    Returns:
        The DataFrame with all added columns, including ``features``.
    """
    df = df.withColumn("label", df["Cancelled"].cast(IntegerType()))

    # Categorical columns become numeric index columns
    for source in ('Origin', 'Dest'):
        fitted = StringIndexer(inputCol=source,
                               outputCol=source + 'Index').fit(df)
        df = fitted.transform(df)

    # (new integer column, source column), applied in this order
    integer_casts = (
        ("YearInt", "Year"),
        ("MonthInt", "Month"),
        ("DayofMonthInt", "DayofMonth"),
        ("DayofWeekInt", "DayOfWeek"),
        ("DepTimeInt", "DepTime"),
        ("CRSDepTimeInt", "CRSDepTime"),
        ("ArrTimeInt", "ArrTime"),
        ("CRSArrTimeInt", "CRSArrTime"),
        ("ActualElapsedTimeInt", "ActualElapsedTime"),
        ("CRSElapsedTimeInt", "CRSElapsedTime"),
        ("ArrDelayInt", "ArrDelay"),
        ("DepDelayInt", "DepDelay"),
        ("DistanceInt", "Distance"),
    )
    for target, source in integer_casts:
        df = df.withColumn(target, df[source].cast(IntegerType()))

    feature_inputs = [
        "YearInt", "MonthInt", "DayofMonthInt", "DayofWeekInt", "DepTimeInt",
        "CRSDepTimeInt", "ArrTimeInt", "CRSArrTimeInt", "ActualElapsedTimeInt",
        "CRSElapsedTimeInt", "ArrDelayInt", "DepDelayInt", "OriginIndex",
        "DestIndex", "DistanceInt",
    ]
    assembler = VectorAssembler(inputCols=feature_inputs,
                                outputCol="features")
    return assembler.transform(df)

Example #16

0

Show file

def vectorizeData(df, labelsCol, weighClass=False, featsCol=None):
    """Create a label/features dataset from a mixed-type Spark DataFrame.

    Nulls are replaced by ``0`` (numeric) and ``"NONE"`` (string). String
    columns other than the label are string-indexed, and the indexed columns
    plus the remaining non-string columns are assembled into one vector.

    Args:
        df: Input Spark DataFrame of categorical and numerical features.
        labelsCol: Name of the column holding the training labels; must be a
            column of ``df``.
        weighClass: When true, a ``weight`` column with balanced class
            weights for the labels ``0.0`` / ``1.0`` is added.
        featsCol: Name of the output vector column; ``"features"`` when
            ``None``.

    Returns:
        A tuple ``(assemblerInputs, df)``: the list of assembled input column
        names and the DataFrame holding the feature column and ``label``
        (plus ``weight`` when ``weighClass`` is true).
    """
    assert labelsCol in df.columns, "'labelsCol' is not in df.columns"
    featuresName = "features" if featsCol is None else featsCol

    # Importantly: replace numerical values by zero and categorical values by "NONE" (string)
    df = df.fillna(0).fillna("NONE")
    stringColList = [
        name for name, dtype in df.dtypes
        if dtype == 'string' and name != labelsCol
    ]
    # Collect the non-string inputs BEFORE indexing. Afterwards the new
    # "...Index" columns are non-string too and would be assembled twice.
    numericColList = [
        name for name, dtype in df.dtypes
        if dtype != 'string' and name != labelsCol
    ]

    # Indexing categorical features (string types)
    indexedCategoricalCols = [
        categoricalCol + "Index" for categoricalCol in stringColList
    ]
    stages = [
        StringIndexer(inputCol=categoricalCol, outputCol=idx_categoricalCol)
        for categoricalCol, idx_categoricalCol in zip(
            stringColList, indexedCategoricalCols)
    ]
    indexer = Pipeline(stages=stages)
    df = indexer.fit(df).transform(df)

    # Assembling indexed and numeric features
    assemblerInputs = indexedCategoricalCols + numericColList
    assembler = VectorAssembler(inputCols=assemblerInputs,
                                outputCol=featuresName)
    df = assembler.transform(df)

    # Indexing binary labels
    labeller = StringIndexer(inputCol=labelsCol, outputCol="label").fit(df)
    df = labeller.transform(df).select([featuresName, "label"])

    if not weighClass:
        return assemblerInputs, df

    from sklearn.utils.class_weight import compute_class_weight as weigh
    labels = [int(i.label) for i in df.select('label').collect()]
    wC0, wC1 = list(
        weigh(class_weight='balanced', classes=[0.0, 1.0], y=labels))
    return assemblerInputs, df.withColumn(
        'weight',
        F.when(df.label == 0.0, wC0).otherwise(wC1))

Example #17

0

Show file

def main(spark, train_data_file, rank_val, reg, alpha_val, user_indexer_model,
         item_indexer_model, model_file):
    '''
    Fit user/item string indexers and an implicit-feedback ALS model on the
    training data, then save all three.

    Parameters
    ----------
    spark : SparkSession object
    train_data_file : string, path to the training parquet file to load
    rank_val : rank passed to ALS
    reg : regParam passed to ALS
    alpha_val : alpha passed to ALS
    user_indexer_model : string, path to store the fitted user indexer
    item_indexer_model : string, path to store the fitted item indexer
    model_file : string, path to store the serialized ALS model
    '''

    # Load the parquet file
    train = spark.read.parquet(train_data_file)

    # Fit the indexers that map the raw ids to numeric user/item columns
    indexer_user = StringIndexer(inputCol="user_id",
                                 outputCol="user",
                                 handleInvalid="skip").fit(train)
    indexer_item = StringIndexer(inputCol="track_id",
                                 outputCol="item",
                                 handleInvalid="skip").fit(train)
    als = ALS(userCol='user',
              itemCol='item',
              implicitPrefs=True,
              ratingCol='count',
              rank=rank_val,
              regParam=reg,
              alpha=alpha_val)

    # The stages are applied one by one so that each fitted piece can be
    # saved to its own path.
    train = indexer_user.transform(train)
    train = indexer_item.transform(train)
    model = als.fit(train)
    indexer_user.save(user_indexer_model)
    indexer_item.save(item_indexer_model)
    model.save(model_file)

Example #18

0

Show file

File: SADecisionTreeClassifierTest.py Project: sahilsingh1123/dmx_deepinsight_prediction

    def labelIndexing(self, sentimentInfoData):
        """Cast the label column to string and string-index it.

        Args:
            sentimentInfoData: Dict read through ``pc.LABELCOLM`` (label
                column name) and ``pc.DATASET`` (the DataFrame).

        Returns:
            The same dict, updated with ``pc.INDEXEDCOLM`` (name of the
            indexed column) and ``pc.DATASET`` (the transformed DataFrame).
        """
        labelColm = sentimentInfoData.get(pc.LABELCOLM)
        frame = sentimentInfoData.get(pc.DATASET)
        indexedLabel = pc.INDEXED_ + labelColm

        # For now every label type is converted to string and then indexed;
        # integer/float/double labels would not strictly need indexing.
        frame = frame.withColumn(labelColm,
                                 frame[labelColm].cast(StringType()))
        indexer = StringIndexer(inputCol=labelColm,
                                outputCol=indexedLabel,
                                handleInvalid="keep")
        frame = indexer.fit(frame).transform(frame)

        sentimentInfoData.update({
            pc.INDEXEDCOLM: indexedLabel,
            pc.DATASET: frame
        })
        return sentimentInfoData

Example #19

0

Show file

def random_forest(df, seed, num_of_trees_list):
    """Train one random forest per tree count and collect accuracy and matrices.

    Args:
        df: DataFrame with a ``team_position`` label column; the
            ``preferred_foot`` column is dropped and every other column is
            used as a feature.
        seed: Seed for the 80/20 train/test split.
        num_of_trees_list: Iterable of tree counts to train with.

    Returns:
        A tuple ``(accuracy_list, cm_list)`` with one accuracy and one
        reordered 3x3 confusion matrix (numpy array) per tree count.
    """
    # preferred_foot is the only categorical feature column, so it is left out.
    df = df.drop("preferred_foot")

    # Numerical version of the team_position label
    df = StringIndexer(inputCol="team_position",
                       outputCol="indexed_label").fit(df).transform(df)

    feature_names = df.drop("team_position", "indexed_label").columns
    df = VectorAssembler(inputCols=feature_names,
                         outputCol="indexed_features").transform(df)

    training_data, testing_data = df.randomSplit([0.8, 0.2], seed)

    evaluator = MulticlassClassificationEvaluator(labelCol="indexed_label",
                                                  predictionCol="prediction",
                                                  metricName="accuracy")
    # Rows and columns of the confusion matrix are reported in this order.
    order = (1, 2, 0)

    accuracy_list = []
    cm_list = []
    for num_of_trees in num_of_trees_list:
        classifier = RandomForestClassifier(labelCol="indexed_label",
                                            featuresCol="indexed_features",
                                            impurity="entropy",
                                            numTrees=num_of_trees,
                                            maxDepth=10)
        predictions = classifier.fit(training_data).transform(testing_data)

        accuracy_list.append(evaluator.evaluate(predictions))

        y_true = predictions.select(['indexed_label']).collect()
        y_pred = predictions.select(['prediction']).collect()

        print("Classification report and confusion matrix for Random Forest with " + str(num_of_trees) + " trees:")
        print(classification_report(y_true, y_pred))
        cm = confusion_matrix(y_true, y_pred)
        confusion_matrix_corrected = [[cm[r][c] for c in order] for r in order]
        print("")
        for corrected_row in confusion_matrix_corrected:
            print(corrected_row)

        cm_list.append(np.array(confusion_matrix_corrected))

    return accuracy_list, cm_list

Example #20

0

Show file

File: mimic_hp_training_scale.py Project: dhlee4/AcuteOrganFailureInterventionModel

    def add_demo(self):
        """Return the demographic feature frame, building it when missing.

        The parquet file at ``self.cur_demo_file_name`` is read first. When
        that read raises an ``AnalysisException``, the frame is built from
        the ``ADMISSIONS`` and ``PATIENTS`` tables under ``self.data_dir``:
        age is derived and capped at 90, rows with age <= 18 are dropped,
        every other column except ``HADM_ID`` is string-indexed and one-hot
        encoded, and everything is assembled into ``demo_feature``. The
        feature names are dumped as JSON to
        ``self.json_demo_feature_dump_loc`` and the frame is saved to
        ``self.cur_demo_file_name``.

        Returns:
            A DataFrame with ``HADM_ID`` renamed to ``ID``. The rebuilt path
            selects only ``ID`` and ``demo_feature``.
        """
        import pyspark
        try:
            return self.spark.read.parquet(self.cur_demo_file_name).withColumnRenamed("HADM_ID", "ID")
        except pyspark.sql.utils.AnalysisException as ex:

            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            self.logger.info(message)
            self.logger.info("PROCESS")

            import json
            # `when` is used below, so it is imported here with the others.
            from pyspark.sql.functions import col, datediff, when
            from pyspark.ml.feature import OneHotEncoder, StringIndexer
            from pyspark.ml.feature import VectorAssembler
            cur_demo = self.spark.read.parquet(self.data_dir + "/ADMISSIONS").select("SUBJECT_ID", "HADM_ID", "ADMITTIME", "ADMISSION_TYPE", "ADMISSION_LOCATION", "INSURANCE", "LANGUAGE", "RELIGION", "MARITAL_STATUS", "ETHNICITY")
            cur_pts = self.spark.read.parquet(self.data_dir + "/PATIENTS").select("SUBJECT_ID", "DOB", "GENDER")
            merged_demo = cur_demo.join(cur_pts, "SUBJECT_ID").drop("SUBJECT_ID")
            merged_demo = merged_demo.withColumn("AGE", datediff("ADMITTIME", "DOB") / 365.0).withColumn("AGE", when(col("AGE") > 90, 90).otherwise(col("AGE"))).drop("ADMITTIME", "DOB").where("AGE > 18").fillna("N/A")

            # Every column except AGE and HADM_ID is treated as categorical.
            target_col = merged_demo.columns
            target_col.remove("AGE")
            target_col.remove("HADM_ID")
            target_col.sort()
            self.logger.debug(target_col)
            vector_target = ["AGE"]
            demo_col_list = ["AGE"]
            for cat_col in target_col:
                indexed_name = "SI_{0}".format(cat_col)
                encoded_name = "OH_{0}".format(cat_col)
                SI_model = StringIndexer(inputCol=cat_col, outputCol=indexed_name).fit(merged_demo)
                # One "<column>||<label>" feature name per indexed label.
                demo_col_list = demo_col_list + [cat_col + "||" + label for label in SI_model.labels]
                merged_demo = SI_model.transform(merged_demo)
                merged_demo = OneHotEncoder(inputCol=indexed_name, outputCol=encoded_name, dropLast=False).transform(merged_demo)
                vector_target.append(encoded_name)

            # The context manager closes the file once the dump is written.
            with open(self.json_demo_feature_dump_loc, "w") as feature_file:
                json.dump({"demo_feature": demo_col_list}, feature_file)
            # vector_target keeps its build order ("AGE", then the OH_ columns
            # in the sorted order of target_col), so the feature vector stays
            # aligned with demo_col_list.
            self.logger.debug(vector_target)
            return_df = VectorAssembler(inputCols=vector_target, outputCol="demo_feature").transform(merged_demo)
            return_df.write.save(self.cur_demo_file_name)
            return_df = self.spark.read.parquet(self.cur_demo_file_name).withColumnRenamed("HADM_ID", "ID").select("ID", "demo_feature")
            return return_df

Example #21

0

Show file

File: naive_bayes.py Project: darkheros12/SOEN_471_Project

def naive_bayes(df, seed):
    """Train a multinomial Naive Bayes model on ``team_position`` and report metrics.

    The ``preferred_foot`` column is discarded, ``team_position`` is string-indexed
    into ``label``, and every remaining column is assembled into ``features``.
    The data is split 80/20 using ``seed``; the model is fit on the larger part
    and evaluated on the smaller one.

    Args:
        df: Spark DataFrame containing ``preferred_foot``, ``team_position`` and
            the feature columns.
        seed: seed forwarded to ``randomSplit``.

    Returns:
        A tuple ``(accuracy, cm)`` where ``cm`` is a NumPy array holding the
        confusion matrix with rows and columns reordered from (0, 1, 2) to
        (1, 2, 0). A classification report and the reordered matrix are also
        printed.
    """
    # preferred_foot is the only categorical feature and is not encoded yet,
    # so it is left out of the model for now.
    prepared = df.drop("preferred_foot")

    position_indexer = StringIndexer(inputCol="team_position", outputCol="label")
    prepared = position_indexer.fit(prepared).transform(prepared).drop("team_position")

    # Everything except the label is treated as a feature.
    feature_names = prepared.drop("label").columns
    prepared = VectorAssembler(inputCols=feature_names, outputCol="features").transform(prepared)

    train_data, test_data = prepared.randomSplit([0.8, 0.2], seed)

    # Training happens in fit().
    model = NaiveBayes(smoothing=1.0, modelType="multinomial").fit(train_data)
    predictions = model.transform(test_data)

    accuracy = MulticlassClassificationEvaluator(
        labelCol="label", predictionCol="prediction", metricName="accuracy"
    ).evaluate(predictions)

    y_true = predictions.select(['label']).collect()
    y_pred = predictions.select(['prediction']).collect()

    print("Classification report and confusion matrix for Naive Bayes:")
    print(classification_report(y_true, y_pred))
    raw_cm = confusion_matrix(y_true, y_pred)

    # Reorder classes (0, 1, 2) -> (1, 2, 0) on both axes. The indexing below
    # requires the confusion matrix to be at least 3x3.
    class_order = (1, 2, 0)
    confusion_matrix_corrected = [[raw_cm[r][c] for c in class_order] for r in class_order]

    print("")
    for matrix_row in confusion_matrix_corrected:
        print(matrix_row)

    return accuracy, np.array(confusion_matrix_corrected)

Example #22

0

Show file

File: mainUtilities.py Project: sahilsingh1123/SentimentAnalysis

    def stringIndexer(infoData):
        """String-index one column of the dataset held in ``infoData``.

        Reads the column to index, the output column name, the dataset and the
        storage location from ``infoData`` (keys come from ``mc``). The column
        is cast to string, indexed with ``handleInvalid="keep"``, and the
        fitted indexer is saved under ``<storage location><column><mc.INDEXER>``,
        overwriting any previous copy.

        Returns:
            ``infoData`` updated with the transformed dataset and with the
            indexer path recorded in the indexer-path mapping.
        """
        source_column = infoData.get(mc.COLMTOINDEX)
        frame = infoData.get(mc.DATASET)
        target_column = infoData.get(mc.INDEXEDCOLM)
        model_path = infoData.get(mc.STORAGELOCATION) + (source_column + mc.INDEXER)

        # Original author's note (sahil): numeric columns (integer/float/double)
        # would not need indexing; for now every datatype is cast to string
        # and then indexed.
        frame = frame.withColumn(source_column, frame[source_column].cast(StringType()))

        fitted_indexer = StringIndexer(
            inputCol=source_column,
            outputCol=target_column,
            handleInvalid="keep",
        ).fit(frame)
        frame = fitted_indexer.transform(frame)
        fitted_indexer.write().overwrite().save(model_path)  # will update this later

        # Record where this column's indexer was stored.
        path_by_column = infoData.get(mc.INDEXERPATHMAPPING)
        path_by_column.update({source_column: model_path})

        changes = {mc.INDEXERPATHMAPPING: path_by_column, mc.DATASET: frame}
        infoData.update(changes)
        return infoData

Example #23

0

Show file

File: spark.py Project: cmu-transparency/lib-iris

def index_nominals(dataframe, renamer=lambda string: u"indexed_%s" % string):
    """Create indexed versions of nominal features in the given dataframe.

    Every string-typed column is run through a ``StringIndexer`` fitted on the
    dataframe; the indexed values are added under the name ``renamer(column)``.
    Columns of any other type are left untouched.

    Args:
        dataframe: Spark DataFrame to index.
        renamer: callable mapping an original column name to the name of its
            indexed counterpart.

    Returns:
        A tuple ``(dataframe_indexing, columns, namer)``:
        the dataframe with the indexed columns added, the list of column names
        to use (indexed name for string columns, original name otherwise), and
        a ``ValueMapper`` built from the original names, types and the labels
        learned per column (empty list for non-indexed columns).
    """
    all_cols = dataframe.columns

    schema = dataframe.schema
    names_by_idx = [str(name) for name in schema.names]
    types_by_idx = [field.dataType for field in schema.fields]
    labels_by_idx = [[] for _ in all_cols]

    dataframe_indexing = dataframe

    # The new (or old if not indexed) column names.
    columns = []

    # Fit and apply a sequence of nominal feature indexers.
    for idx, col_name in enumerate(all_cols):
        # Use isinstance rather than `is StringType()`: an identity test
        # against a freshly constructed instance only succeeds if the type
        # class is implemented as a singleton.
        if not isinstance(types_by_idx[idx], StringType):
            columns.append(col_name)
            continue

        # Encode nominal features into doubles.
        indexed_name = renamer(col_name)
        indexer = StringIndexer(
            inputCol=col_name, outputCol=indexed_name).fit(dataframe_indexing)

        labels_by_idx[idx] = indexer.labels
        dataframe_indexing = indexer.transform(dataframe_indexing)
        columns.append(indexed_name)

    # Create the object that holds the information necessary to get
    # column and value names for the various converted features and
    # values.
    namer = ValueMapper(columns_by_idx=names_by_idx,
                        types_by_idx=types_by_idx,
                        values_by_idx=labels_by_idx)

    return dataframe_indexing, columns, namer

Example #24

0

Show file

File: main_parallel.py Project: BingZou/RecommenderSystem

def get_rs(args):
    """Fit an implicit-feedback ALS recommender for one hyper-parameter setting.

    Args:
        args: an iterable unpacked as ``(rank, regParam, alpha)``.

    Returns:
        ``[rank, regParam, alpha, rmse]`` where ``rmse`` is computed on the
        validation split (``cf_validation_subsampled.parquet``). The test split
        is loaded and indexed but is not used for scoring here.
    """
    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.recommendation import ALS
    from pyspark.ml.feature import StringIndexer
    from pyspark.sql import SparkSession
    import random
    import string

    # Random six-character suffix so the application name differs per call.
    alphabet = string.ascii_uppercase + string.digits
    session_suffix = ''.join(random.choice(alphabet) for _ in range(6))

    spark = SparkSession.builder.appName('Session_%s' % session_suffix).getOrCreate()
    df_train = spark.read.parquet("./cf_train_subsampled.parquet")
    df_val = spark.read.parquet("./cf_validation_subsampled.parquet")
    df_test = spark.read.parquet("./cf_test_subsampled.parquet")

    # Train contains all users but not all tracks, so the track indexer is
    # fitted on train + validation.
    user_indexer = StringIndexer(inputCol="user_id", outputCol="user_id_numeric").fit(df_train)
    track_indexer = StringIndexer(inputCol="track_id", outputCol="track_id_numeric").fit(
        df_train.union(df_val))

    def add_numeric_ids(frame):
        # Apply the user indexer first, then the track indexer.
        return track_indexer.transform(user_indexer.transform(frame))

    df_train = add_numeric_ids(df_train)
    df_val = add_numeric_ids(df_val)
    df_test = add_numeric_ids(df_test)

    rank, regParam, alpha = args
    recommender = ALS(
        rank=rank,
        maxIter=10,
        regParam=regParam,
        alpha=alpha,
        implicitPrefs=True,
        userCol="user_id_numeric",
        itemCol="track_id_numeric",
        ratingCol="count",
        coldStartStrategy="drop",
    )
    model = recommender.fit(df_train)

    # Score the model with RMSE between "count" and the prediction on the
    # validation frame.
    val_predictions = model.transform(df_val)
    rmse = RegressionEvaluator(
        metricName="rmse", labelCol="count", predictionCol="prediction"
    ).evaluate(val_predictions)
    return [rank, regParam, alpha, rmse]

Example #25

0

Show file

File: H2.py Project: zhangshun97/Advanced-Big-Data-Analysis

                      sep="\t",
                      inferSchema="true",
                      header="false")
# Test split: tab-separated, header-less CSV with inferred column types.
test = spark.read.load(
    "hdfs://10.190.2.112/data/test_set.txt",
    format="csv",
    sep="\t",
    inferSchema="true",
    header="false",
)

# Union of all splits, used only to fit the feature transformers so that every
# category seen in any split gets an index.
total = train.union(val).union(test)

# Feature: index column _c2 into c22.
indexer = StringIndexer(inputCol="_c2", outputCol="c22").fit(total)
train, val, test = (indexer.transform(part) for part in (train, val, test))

# Label: index column _c6 into label.
indexer = StringIndexer(inputCol="_c6", outputCol="label").fit(total)
train, val, test = (indexer.transform(part) for part in (train, val, test))

# One-hot encode the indexed feature c22 into c2.
encoder = OneHotEncoder(inputCol="c22", outputCol="c2")
train, val, test = (encoder.transform(part) for part in (train, val, test))

# create the trainer and set its parameters

Example #26

0

Show file

File: Amazon_recommendation.py Project: RishikaMachina/Amazon_Product_Recommendation

# Keep only the product id, reviewer id and rating columns.
df = df.select('asin','reviewerID','overall')
df.printSchema()
df.show()

# Encode the string IDs as numeric indices so they can be fed to the model.
from pyspark.ml.feature import StringIndexer

# `a` is the unfitted reviewer indexer here; `r` is its fitted model.
# Rows with unseen/invalid labels are skipped at transform time.
a = StringIndexer(inputCol="reviewerID", outputCol="reviewerIDIndex",  handleInvalid='skip')
r = a.fit(df)
indexedDf = r.transform(df)
indexedDf.show()

# NOTE: `a` is rebound below to the *fitted* asin indexer model, replacing the
# reviewer indexer it referred to above.
asinIndexer = StringIndexer(inputCol="asin", outputCol="asinIndex",handleInvalid='skip')
a = asinIndexer.fit(df)
indexedDf = a.transform(indexedDf)
indexedDf.show()

from pyspark.sql.types import IntegerType
# NOTE(review): regexp_replace is not used in the visible part of this script.
from pyspark.sql.functions import regexp_replace

# Overwrite the original string ID columns with the integer-cast indices.
indexedDf = indexedDf.withColumn("reviewerID", indexedDf["reviewerIDIndex"].cast(IntegerType()))
indexedDf = indexedDf.withColumn("asin",indexedDf["asinIndex"].cast(IntegerType()))
#indexedDf.show()

#indexedDf.toPandas().to_csv(indexedDf.csv, header=True, index=False)
# Drop the helper index columns, keeping the same three columns as the input.
indexedDf = indexedDf.select('asin','reviewerID','overall')

indexedDf.show()

# Row count of the indexed data.
print(indexedDf.count())

Example #27

0

Show file

# COMMAND ----------

# Preview the first five rows before indexing.
data.show(5)

# COMMAND ----------

from pyspark.ml.feature import StringIndexer

# COMMAND ----------

# Fit a string indexer that maps the 'Cruise_line' column to a numeric
# 'cruise_idx' column.
indexer = StringIndexer(inputCol='Cruise_line',
                        outputCol='cruise_idx').fit(data)

# COMMAND ----------

# Add the 'cruise_idx' column to the data.
data = indexer.transform(data)

# COMMAND ----------

# Preview the first five rows again, now including 'cruise_idx'.
data.show(5)

# COMMAND ----------

from pyspark.ml import linalg
from pyspark.ml.feature import VectorAssembler

# COMMAND ----------

vector = VectorAssembler(inputCols=[
    'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density',
    'cruise_idx'

Example #28

0

Show file

# Numeric copy of Fare, then fill missing values in the numeric helper columns.
testing = testing.withColumn("Fare2", testing["Fare"].cast(DoubleType()))
# -1 marks a missing value in these columns.
testing = testing.fillna(-1, subset=["Pclass2", "SibSp2", "Parch2", "Fare2"])
# NOTE(review): 29.67 is presumably the mean age of the training data - confirm.
testing = testing.fillna(29.67, subset=["Age2"])

# COMMAND ----------

display(training)

# COMMAND ----------

# Fit the label indexer ("Survived" -> "indexedLabel") on the training data.
labelIndexer = StringIndexer(inputCol="Survived",
                             outputCol="indexedLabel").fit(training)

# COMMAND ----------

# Apply the label indexer and show the original label next to its index.
trainingFeatureTest = labelIndexer.transform(training)
display(trainingFeatureTest.select("Survived", "indexedLabel"))

# COMMAND ----------

# Fit an indexer for the categorical "Sex" column ("Sex" -> "feature1").
featureIndexer1 = StringIndexer(inputCol="Sex",
                                outputCol="feature1").fit(training)

# COMMAND ----------

# Apply it on top of the label-indexed frame and show the result.
trainingFeatureTest = featureIndexer1.transform(trainingFeatureTest)
display(
    trainingFeatureTest.select("Survived", "indexedLabel", "Sex", "feature1"))

# COMMAND ----------

Example #29

0

Show file

# Columns kept for modelling: the target ('Price') plus six categorical fields.
features = ['Price', 'Date of Transfer', 'Property Type', 'Old/New', 'Town/City', 'District', 'County']
data = data.select(features)

# Convert all selected string columns into numeric indices. Every indexer is
# fitted on the untransformed selection before any of them is applied.
date_indexer = StringIndexer(inputCol='Date of Transfer', outputCol='Date_of_TransferIndexed').fit(data)
property_type_indexer = StringIndexer(inputCol='Property Type', outputCol='Property_typeIndexed').fit(data)
olde_new_indexer = StringIndexer(inputCol='Old/New', outputCol='Old_NewIndexed').fit(data)
town_indexer = StringIndexer(inputCol='Town/City', outputCol='TownIndexed').fit(data)
district_indexer = StringIndexer(inputCol='District', outputCol='DistrictIndexed').fit(data)
county_indexer = StringIndexer(inputCol='County', outputCol='CountyIndexed').fit(data)

# Apply the fitted indexers one after another, each adding its output column.
data = date_indexer.transform(data)
data = property_type_indexer.transform(data)
data = olde_new_indexer.transform(data)
data = town_indexer.transform(data)
data = district_indexer.transform(data)
data = county_indexer.transform(data)
# Fixed: the original read `data.show` without parentheses, which only looks
# up the method and prints nothing.
data.show()

# NOTE(review): only the date and county indices are assembled into the
# feature vector; the other four indexed columns are currently unused.
assembler=VectorAssembler(inputCols=['Date_of_TransferIndexed', 'CountyIndexed'],outputCol='features')
output=assembler.transform(data)
final_data=output.select('features','Price')
# 70/30 random train/test split (no seed is given).
train_data,test_data=final_data.randomSplit([0.7,0.3])

# Linear regression predicting 'Price' from the assembled features.
lr=LinearRegression(labelCol='Price')
lr_model=lr.fit(train_data)
# save results

Example #30

0

Show file

File: Airline Delays - Data Processing Pipeline.py Project: stevendleung/Predicting_Airline_Delays

# Cache the joined weather/airline data so the two filters below reuse it.
cached_join = weather_airline_joined.cache()

# Year-based train/test split; both halves are cached as well.
train_set = filter_to_train(cached_join).cache()
test_set = filter_to_test(cached_join).cache()

# COMMAND ----------

# Label: index "dep_del15" into "label", fitted on the training set only.
# Invalid values are kept rather than raising.
labelIndexer = StringIndexer(
    inputCol="dep_del15", outputCol="label", handleInvalid="keep"
).fit(train_set)

train_set = labelIndexer.transform(train_set)
test_set = labelIndexer.transform(test_set)

# Categorical feature columns to be indexed.
categorical = [
    "month",
    "day_of_week",
    "op_unique_carrier",
    "Holiday",
    "PREVIOUS_FLIGHT_DELAYED_FOR_MODELS",
    "origin_WND_direction_angle",
    "origin_WND_type_code",
    "origin_CIG_ceiling_visibility_okay",
    "origin_VIS_variability",
    "dest_WND_direction_angle",
    "dest_WND_type_code",
    "dest_CIG_ceiling_visibility_okay",
    "dest_VIS_variability",
    "crs_dep_hour",
    'distance_group',
    'origin_airport_id',
]

# Output column name for each categorical column: "<name>_Index".
categorical_index = ["{0}_Index".format(i) for i in categorical]

stringIndexer = StringIndexer(

Example #31

0

Show file

File: Football_Predictor_Spark.py Project: GillesVandewiele/FootballPredictor

# Time-based split: rows up to split_time train the model, later rows test it.
train_feature_df = feature_df.filter(feature_df['time'] <= split_time)
test_feature_df = feature_df.filter(feature_df['time'] > split_time)

train_feature_df = train_feature_df.drop('time')
test_feature_df = test_feature_df.drop('time')

# Every column except the label and the team names is a feature. The columns
# are taken in DataFrame order (instead of from a set difference) so the
# layout of the feature vector is the same on every run.
non_feature_cols = {'result', 'home_name', 'away_name'}
assembler = VectorAssembler(
    inputCols=[name for name in train_feature_df.columns if name not in non_feature_cols],
    outputCol="features")

train_df = assembler.transform(train_feature_df)
test_df = assembler.transform(test_feature_df)

# The label indexer is fitted on the full data so both splits share one mapping.
labelIndexer = StringIndexer(inputCol="result", outputCol="indexedResult").fit(feature_df)

train_df = labelIndexer.transform(train_df)
test_df = labelIndexer.transform(test_df)

# `labels` is a property of the fitted model, not a method; calling it as
# `labels()` raises TypeError.
# label_mapping: index -> original label; reverse_mapping: label -> index.
label_mapping = dict(enumerate(labelIndexer.labels))
reverse_mapping = {label: index for index, label in label_mapping.items()}


# ## Dimensionality reduction
# 
# Feature selection is not really supported yet in mllib, so dimensionality
# reduction with PCA is applied instead.

# In[509]:

# Project the feature vector onto 15 principal components, fitted on train only.
pca = PCA(inputCol="features", outputCol="pca", k=15).fit(train_df)

Example #32

0

Show file

File: applyModelSpark.py Project: timjerman/AdLoadingMiner

def applyModel(fileName, loadModelName, outlierPercentile = 100):
    """Score a tab-separated ad-loading log with a saved random forest model.

    The file is read as text, its first line is used as the tab-separated
    header, numeric columns are cleaned and cast, several combined features
    are derived, categorical columns are string-indexed (each fitted indexer
    is saved as ``indexer_<column>``), and the assembled feature vectors are
    passed to the loaded model.

    Args:
        fileName: path handed to ``SparkContext.textFile``.
        loadModelName: path handed to ``RandomForestModel.load``.
        outlierPercentile: percentile of ``ADLOADINGTIME`` used as an
            exclusive upper bound; rows at or above it are dropped.

    Returns:
        The collected list of model predictions.
    """
    sc = SparkContext( 'local', 'pyspark')
    sqlContext = SQLContext(sc)

    #########
    # load data
    #########

    data = sc.textFile(fileName)
    # Extract the header line and remove it from the data. The raw line and the
    # parsed column list get separate names so the filter lambda cannot end up
    # comparing against the parsed list (closures bind names late).
    header_line = data.first()
    data = data.filter(lambda x: x != header_line).cache()
    header = header_line.split('\t')
    # parse data
    data = data.map(lambda x: x.split('\t'))

    #########
    # prepare features
    #########

    df = sqlContext.createDataFrame(data, header)

    # Replace every occurrence of the text 'null' with '0', then cast.
    df = df.withColumn("ADLOADINGTIME",
                       func.regexp_replace('ADLOADINGTIME', 'null', '0').cast('float'))
    int_columns = [
        "TIMESTAMP",
        "GEOIP_LAT",
        "GEOIP_LNG",
        "HOSTWINDOWHEIGHT",
        "HOSTWINDOWWIDTH",
        "TOPMOSTREACHABLEWINDOWHEIGHT",
        "TOPMOSTREACHABLEWINDOWWIDTH",
    ]
    for name in int_columns:
        df = df.withColumn(name, func.regexp_replace(name, 'null', '0').cast('int'))

    # Drop rows whose loading time is at or above the requested percentile.
    thr = np.percentile(df.select("ADLOADINGTIME").rdd.collect(), outlierPercentile)
    df = df.filter(func.col('ADLOADINGTIME') < thr)

    df = df.withColumn("TOPMOSTREACHABLEWINDOWAREA",
                       func.col("TOPMOSTREACHABLEWINDOWHEIGHT")*func.col("TOPMOSTREACHABLEWINDOWWIDTH"))
    df = df.withColumn("INTENDENTISACTUALDEVICETYPE",
                       (func.col("ACTUALDEVICETYPE")==func.col("INTENDEDDEVICETYPE")).cast('int'))
    df = df.withColumn("COMBINEDID",
                       func.concat(
                           func.col('ACCOUNTID'),
                           func.col('CAMPAIGNID'),
                           func.col('CREATIVEID'),
                           func.col('SDK')))

    def blank_nulls(name):
        # Column expression with the text 'null' removed.
        return func.regexp_replace(name, 'null', '')

    df = df.withColumn("COMBINEDEXTERNALID",
                       func.concat(
                           blank_nulls('EXTERNALADSERVER'),
                           blank_nulls('EXTERNALPLACEMENTID'),
                           blank_nulls('EXTERNALSITEID'),
                           blank_nulls('EXTERNALSUPPLIERID')))

    df = df.withColumn("PLATFORMCOMBINED",
                       func.concat(
                           blank_nulls('PLATFORM'),
                           blank_nulls('PLATFORMVERSION')))

    df = df.withColumn("UA_OSCOMB",
                       func.concat(
                           blank_nulls('UA_OS'),
                           blank_nulls('UA_OSVERSION')))

    # Reduce FILESJSON to a comma-separated list of numbers: keep only digits
    # and commas, strip a leading comma, collapse doubled commas. The pattern
    # is a raw string so that `\d` is not an invalid string escape.
    df = df.withColumn("FILESJSON_SIZE",
                       func.regexp_replace('FILESJSON', r'[^,\d]', ''))
    df = df.withColumn("FILESJSON_SIZE",
                       func.regexp_replace('FILESJSON_SIZE', '^,', ''))
    df = df.withColumn("FILESJSON_SIZE",
                       func.regexp_replace('FILESJSON_SIZE', ',,', ','))

    # Sum the numbers in that list into a single integer per row.
    sum_sizes = func.udf(lambda x: int(np.fromstring(x,dtype=int, sep=',').sum()), IntegerType())
    df = df.withColumn("FILESJSON_SIZE", sum_sizes("FILESJSON_SIZE"))

    print('Loaded and prapared %d entries' % df.count())

    #########
    # keep only needed features
    #########

    features = ['ADLOADINGTIME',
     'PLACEMENTID',
     'TIMESTAMP',
     'CREATIVETYPE',
     'UA_HARDWARETYPE',
     'UA_VENDOR',
     'UA_MODEL',
     'UA_BROWSER',
     'UA_BROWSERVERSION',
     'FILESJSON',
     'ERRORSJSON',
     'TOPMOSTREACHABLEWINDOWAREA',
     'FILESJSON_SIZE',
     'COMBINEDID',
     'COMBINEDEXTERNALID',
     'PLATFORMCOMBINED',
     'UA_OSCOMB',
     'SDK',
     'EXTERNALADSERVER'
       ]

    df = df.select(features)

    #########
    # Convert categorical features to numerical
    #########

    featuresCat = [
     'PLACEMENTID',
     'CREATIVETYPE',
     'UA_HARDWARETYPE',
     'UA_VENDOR',
     'UA_MODEL',
     'UA_BROWSER',
     'UA_BROWSERVERSION',
     'FILESJSON',
     'ERRORSJSON',
     'COMBINEDID',
     'COMBINEDEXTERNALID',
     'PLATFORMCOMBINED',
     'UA_OSCOMB',
     'SDK',
     'EXTERNALADSERVER'
       ]

    # NOTE(review): the indexers are re-fitted on the data being scored and
    # overwrite any saved "indexer_<column>"; confirm this is intended rather
    # than loading the indexers saved at training time.
    for name in featuresCat:
        indexer = StringIndexer(inputCol=name, outputCol='_' + name).setHandleInvalid("skip").fit(df)
        df = indexer.transform(df).drop(name)
        writer = indexer._call_java("write")
        writer.overwrite().save("indexer_" + name)

    # Indexed categorical columns (prefixed with '_') plus the numeric ones.
    features = ['_' + name for name in featuresCat]
    features.append('TIMESTAMP')
    features.append('FILESJSON_SIZE')
    features.append('TOPMOSTREACHABLEWINDOWAREA')

    #########
    # Assemble features
    #########

    assembler = VectorAssembler(
        inputCols=features,
        outputCol="features")

    df = assembler.transform(df)

    #########
    # Convert to labeled point
    #########

    # Go through `.rdd` explicitly: DataFrame.map only exists in Spark 1.x,
    # whereas `.rdd.map` is available in both 1.x and 2.x+.
    lp = (df.select(func.col("ADLOADINGTIME").alias("label"), func.col("features"))
      .rdd
      .map(lambda row: LabeledPoint(row.label, row.features)))
    lp.cache()

    #########
    # Load trained model
    #########

    model = RandomForestModel.load(sc, loadModelName)

    print('Model loaded!')

    predictions = model.predict(lp.map(lambda x: x.features)).collect()

    return predictions