Example 1
def read_csv(spark, file_name):
    sql_context = SQLContext(spark)

    df = sql_context.read.format('com.databricks.spark.csv').options(
        header='true', format="string").load(file_name)

    dateIndexer = StringIndexer(inputCol="date",
                                outputCol="date_index").fit(df)
    serialIndexer = StringIndexer(inputCol="serial_number",
                                  outputCol="serial_number_index").fit(df)
    modelIndexer = StringIndexer(inputCol="model",
                                 outputCol="model_index").fit(df)

    df1 = dateIndexer.transform(df)
    df2 = serialIndexer.transform(df1)
    df3 = modelIndexer.transform(df2)

    df3 = df3.na.fill("0")
    f_cols = df3.columns[5:]
    for name in f_cols:
        # df3 = df3.withColumn(name, "0" if df3[name] == "null" else df3[name])
        df3 = df3.withColumn(name, df3[name].cast("double"))

    # df3.show()

    assembler = VectorAssembler(inputCols=f_cols, outputCol="indexedFeatures")
    df4 = assembler.transform(df3)

    return df4
Example 2
    def __clean_data(self, df, is_fraud="isfraud"):
        ignore = [is_fraud, 'label']

        # Remove unused columns
        df = df.drop(*['paysim_id', 'nameorig', 'namedest'])

        #String Indexing
        string_indexer = StringIndexer(inputCol="type",
                                       outputCol="type_numeric").fit(df)
        df = string_indexer.transform(df)
        df = df.drop(df.type)

        #One-hot encoding
        encoder = OneHotEncoder(inputCol="type_numeric",
                                outputCol="type_vector")
        df = encoder.transform(df)
        df = df.drop("type_numeric")

        #Label encoding
        label_stringIdx = StringIndexer(inputCol=is_fraud,
                                        outputCol='label').fit(df)
        df = label_stringIdx.transform(df)
        df = df.drop(is_fraud)

        #Vector Assembling
        assembler = VectorAssembler(
            inputCols=[x for x in df.columns if x not in ignore],
            outputCol='features')
        df = assembler.transform(df)

        # dataframe in the correct format
        selectedCols = ['label', 'features']
        df = df.select(selectedCols)

        return df
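Note: the OneHotEncoder above is applied without fitting, which matches the pre-3.0 transformer API. On Spark 3.x the encoder is an estimator and must be fit first; a minimal sketch of the equivalent step, assuming the same df with the type_numeric column already present:

from pyspark.ml.feature import OneHotEncoder

# Spark 3.x: OneHotEncoder is an estimator; fit it before transforming.
encoder = OneHotEncoder(inputCols=["type_numeric"], outputCols=["type_vector"])
df = encoder.fit(df).transform(df).drop("type_numeric")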
Example 3
def load_csv():
    raw_df = spark.read.format("csv") \
        .option("header", "true") \
        .option("mode", "DROPMALFORMED") \
        .load(csv_path)

    data_df = raw_df.select("Cardholder Last Name",
                            "Cardholder First Initial",
                            "Amount",
                            "Vendor",
                            "Year-Month") \
        .select(
        concat(col("Cardholder Last Name"), lit(" "), col("Cardholder First Initial")).alias("u"),
        concat(col("Vendor")).alias("m"),
        col("Year-Month").alias("date"),
        col("Amount")
    )

    userIndexer = StringIndexer(inputCol="u", outputCol="uid").fit(data_df)
    itemIndexer = StringIndexer(inputCol="m", outputCol="mid").fit(data_df)

    data_df = itemIndexer.transform(userIndexer.transform(data_df)) \
        .withColumn("uid", (col("uid") + 1).cast(FloatType())) \
        .withColumn("mid", (col("mid") + 1).cast(FloatType())) \
        .cache()

    month_seq_udf = udf(lambda s: _date_to_month(s))
    uDF = data_df.select("uid", "u").distinct().orderBy("uid")
    mDF = data_df.select("mid", "m").distinct().orderBy("mid")
    tDF = data_df.filter(data_df["uid"] <= u_limit).filter(data_df["mid"] <= m_limit) \
        .withColumn("month", month_seq_udf(col("date"))) \
        .drop("u", "m")
    return uDF, mDF, tDF
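Note: month_seq_udf above relies on udf's default StringType return type. If _date_to_month (defined elsewhere in this module) returns an integer month index, the return type can be declared explicitly; a small sketch under that assumption:

from pyspark.sql.functions import udf
from pyspark.sql.types import IntegerType

# Declaring the return type avoids the default StringType assumption.
month_seq_udf = udf(lambda s: _date_to_month(s), IntegerType())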
Example 4
def main(train_x,
         train_y,
         test_x,
         test_y=None,
         idf=False,
         ngram=1,
         base='gs',
         asm=False):
    # Load : DF[id, url, features, label?]
    # The DataFrames only have a labels column if labels are given.
    # We drop the text, since Naive Bayes doesn't use it and we already have all the tokens
    kind = 'asm' if asm else 'bytes'
    train = elizabeth.load(train_x, train_y, base=base, kind=kind).drop('text')
    test = elizabeth.load(test_x, test_y, base=base, kind=kind).drop('text')

    # convert the string labels to numeric indices
    # the handleInvalid param allows the label indexer to deal with labels that weren't seen during fitting
    label_indexer = StringIndexer(inputCol='label',
                                  outputCol='indexedLabel',
                                  handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)
    # the test set won't always have labels
    if test_y is not None:
        test = label_indexer.transform(test)

    index_labeller = IndexToString(inputCol='prediction',
                                   outputCol='predictedClass',
                                   labels=label_indexer.labels)

    # Train the preprocessor and transform the data.
    prep = elizabeth.Preprocessor()
    prep.add(NGram(n=int(ngram)))
    prep.add(CountVectorizer())
    if idf: prep.add(IDF())
    train = prep.fit(train)
    test = prep.transform(test)

    # Naive Bayes : DF[id, url, text, features, label?, rawPrediction, probability, prediction]
    nb = NaiveBayes(labelCol='indexedLabel').fit(train)
    test = nb.transform(test)
    test = index_labeller.transform(
        test)  # DF[id, url, ... prediction, predictedClass]

    # If labels are given for the test set, print a score.
    if test_y:
        test = test.orderBy(test.id)
        test = test.withColumn(
            'correct', (test.label == test.predictedClass).cast('double'))
        test = test.select(avg(test.correct))
        print(test.show())

    # If no labels are given for the test set, print predictions.
    else:
        test = test.orderBy(test.id).select(test.predictedClass)
        test = test.rdd.map(lambda row: int(row.predictedClass))
        test = test.toLocalIterator()
        print(*test, sep='\n')
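Note: the accuracy above is computed by hand from a 'correct' column. When test_y is provided, the same score can be obtained from the indexed label and prediction columns with MulticlassClassificationEvaluator; a sketch, not the author's code:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Accuracy over the numeric label/prediction columns produced by the pipeline.
evaluator = MulticlassClassificationEvaluator(
    labelCol='indexedLabel', predictionCol='prediction', metricName='accuracy')
print(evaluator.evaluate(test))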
Example 5
def main(spark):
    '''

    Parameters
    ----------
    spark : SparkSession object
    '''
    test_file = 'hdfs:/user/bm106/pub/project/cf_test.parquet'
    test = spark.read.parquet(test_file)
    test.createOrReplaceTempView('test')

    w = Window.partitionBy("user_id")

    def ratio_count(c, w):
        return (col(c) / count(c).over(w))


    test = test.select("user_id", "track_id", ratio_count("count", w).alias("count"))
    test.createOrReplaceTempView('test')
    print("Ratio scores done")

    train_sample = spark.read.parquet('hdfs:/user/dev241/extension4_ratio.parquet')
    train_sample.createOrReplaceTempView('train_sample')
    print("Training sample ext4 loaded")

    StringIndexer = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')
    test_idx = StringIndexer.transform(test)
    train_idx = StringIndexer.transform(train_sample)

    #change to best
    rank = 78 
    alpha = 14.287069059772636
    reg = 0.41772043857578584

    model = ALSModel.load("Extension4_ratio")
    print('Model loaded')

    #test ranking metrics
    test_idx = test_idx.select('user_idx','track_idx','count')
    test_users = test_idx.select('user_idx').distinct()
    test_comb = test_idx.groupBy('user_idx').agg(F.collect_set('track_idx').alias('test_labels'))
    track_number = 500
    rec_test = spark.read.parquet('hdfs:/user/dev241/rec_test4.parquet')
    print('Rec test loaded.')
    join = test_comb.join(rec_test,test_comb.user_idx == rec_test.user_idx)
    print('Join done.')
    j4 = join.toDF('user_idx', 'test_labels','user_idx2','recommendations')
    j4.write.parquet("ext4join")
    print('j4 parquet written')
    predictionAndLabels = join.rdd.map(lambda r: ([track.track_idx for track in r.recommendations], r.test_labels))
    print('Map done.')
    metrics = RankingMetrics(predictionAndLabels)
    print('RM done.')
    mavgp = metrics.meanAveragePrecision
    print("Test mean Average Precision : ",mavgp)
    pass
Example 6
def main(spark):
    '''

    Parameters
    ----------
    spark : SparkSession object
    '''

    # File names
    test_file = 'hdfs:/user/bm106/pub/project/cf_test.parquet'
    train_sample_file = 'hdfs:/user/ah3243/extension1_count_greater_1.parquet'

    # Reading the parquet files
    test = spark.read.parquet(test_file)
    train_sample = spark.read.parquet(train_sample_file)

    # StringIndexer
    print("String Indexer entered")
    StringIndexer = PipelineModel.load('hdfs:/user/dev241/DieterStringIndexer')
    test_idx = StringIndexer.transform(test)
    train_idx = StringIndexer.transform(train_sample)
    print("String Indexer done")

    #change to best
    rank = 78
    alpha = 14.287069059772636
    reg = 0.41772043857578584

    #model
    als = ALS(rank=rank,
              alpha=alpha,
              regParam=reg,
              userCol="user_idx",
              itemCol="track_idx",
              ratingCol="count",
              coldStartStrategy="drop",
              implicitPrefs=True)
    model = als.fit(train_idx)
    print("Model fit for test done")
    model.save("Test_Model")
    print("Model save for test done")

    #test ranking metrics
    test_idx = test_idx.select('user_idx', 'track_idx', 'count')
    test_users = test_idx.select('user_idx').distinct()
    test_comb = test_idx.groupBy('user_idx').agg(
        F.collect_set('track_idx').alias('test_labels'))
    track_number = 500
    rec_test = model.recommendForUserSubset(test_users, track_number)
    join = test_comb.join(rec_test, test_comb.user_idx == rec_test.user_idx)
    predictionAndLabels = join.rdd.map(lambda r: (
        [track.track_idx for track in r.recommendations], r.test_labels))
    metrics = RankingMetrics(predictionAndLabels)
    mavgp = metrics.meanAveragePrecision
    print("Test mean Average Precision : ", mavgp)
    pass
Example 7
def indexData(df_sample):
    df_sche = df_sample.schema.fields
    for s in df_sche:
        n = s.name
        if (n != "target") & (n != "id"):
            print(n)
            indexer = StringIndexer(inputCol=n,
                                    outputCol=n + "_index").fit(df_sample)
            df_sample = indexer.transform(df_sample).drop(n)
        elif n == "id":
            indexer = StringIndexer(inputCol=n,
                                    outputCol=n + "_index").fit(df_sample)
            df_sample = indexer.transform(df_sample)
    return df_sample
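Note: each column above gets its own separately fitted StringIndexer. The same indexing can be expressed as a single Pipeline so that all fitted indexers live in one model (as Example 16 below does); a hedged sketch over the same df_sample:

from pyspark.ml import Pipeline
from pyspark.ml.feature import StringIndexer

# One StringIndexer stage per column; the fitted PipelineModel holds them all.
cols_to_index = [s.name for s in df_sample.schema.fields if s.name != "target"]
stages = [StringIndexer(inputCol=c, outputCol=c + "_index") for c in cols_to_index]
df_indexed = Pipeline(stages=stages).fit(df_sample).transform(df_sample)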
Example 8
    def train_test(self, df):
        
        df = self.dropNonTCPUDP(df)

        catCols = []
        numCols = ['avg_ipt', 'bytes_in', 'bytes_out', 'entropy', 'total_entropy', 'num_pkts_out', 'num_pkts_in', 'duration']
        labelCol = 'label'

        data = self.get_dummy(df, catCols, numCols, labelCol)
        data.show()

        labelIndexer = StringIndexer(inputCol='label',
                             outputCol='indexedLabel').fit(data)

        labelIndexer.transform(data)

        featureIndexer = VectorIndexer(inputCol="features", \
                                        outputCol="indexedFeatures").fit(data)
        featureIndexer.transform(data)

        (trainingData, testData) = data.randomSplit([0.7, 0.3])
        trainingData.cache()
     #   trainingData.repartition(200)
        testData.cache()
       # testData.repartition(200)
        trainingData.show(5,False)
        testData.show(5,False)

        rf = RandomForestClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
        gbt = GBTClassifier(featuresCol='indexedFeatures', labelCol='indexedLabel')
        logr = LogisticRegression(featuresCol='indexedFeatures', labelCol='indexedLabel')

        # Convert indexed labels back to original labels.
        labelConverter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=labelIndexer.labels)
        
        pipeline = Pipeline(stages=[labelIndexer, featureIndexer, gbt, labelConverter])
        model = pipeline.fit(trainingData)
        predictions = model.transform(testData)
        # Select example rows to display.
        predictions.select("features","label","predictedLabel", "prediction")

        # Select (prediction, true label) and compute test error
 
        print(self.getTestError(predictions))
        self.printMetrics(predictions)
      #  print(self.ExtractFeatureImp(model.stages[-2].featureImportances, testData, "features"))

        return model
Example 9
def get_sample_data():
    '''
    This function loads and returns the iris dataset for example purposes.

    Arguments: None

    Returns:
      data {PySpark DataFrame} -- Returns the iris dataset
    '''

    iris = datasets.load_iris()
    data1 = pd.DataFrame(data=np.c_[iris['data'], iris['target']],
                         columns=iris['feature_names'] + ['target'])
    data = spark.createDataFrame(data1)

    # vectorize all numerical columns into a single feature column
    feature_cols = data.columns[:-1]
    assembler = VectorAssembler(inputCols=feature_cols, outputCol='features')
    data = assembler.transform(data)

    # convert text labels into indices
    data = data.select(['features', 'target'])
    label_indexer = StringIndexer(inputCol='target',
                                  outputCol='label').fit(data)
    data = label_indexer.transform(data)

    # only select the features and label column
    data = data.select(['features', 'label'])

    return data
Example 10
    def isLabelIndexed(self, schemaData, label, dataset):
        isLabelIndexed = "no"
        labelIndexedInfo = {}
        labelIndexer = None
        for schemaVal in schemaData:
            if (str(schemaVal.dataType) == "StringType"
                    and schemaVal.name == label):
                labelIndexer = StringIndexer(
                    inputCol=label,
                    outputCol=PredictiveConstants.INDEXED_ + label,
                    handleInvalid="keep").fit(dataset)
                dataset = labelIndexer.transform(dataset)
                label = PredictiveConstants.INDEXED_ + label
                isLabelIndexed = "yes"
            if (str(schemaVal.dataType) != "StringType"
                    and schemaVal.name == label):
                label = label
                isLabelIndexed = "no"

        labelIndexedInfo.update({
            PredictiveConstants.DATASET: dataset,
            PredictiveConstants.ISLABELINDEXED: isLabelIndexed,
            PredictiveConstants.LABELINDEXER: labelIndexer
        })

        return labelIndexedInfo
Example 11
def run(start1, end1, start2, end2, df, sc, sql_context, is_pred):
    lp_data= get_labeled_points(start1, end2, df, sc, sql_context)
    print(lp_data.count())

    labelIndexer = StringIndexer(inputCol="label", outputCol="indexedLabel").fit(lp_data)
    td = labelIndexer.transform(lp_data)
    label2index = {}
    for each in  sorted(set([(i[0], i[1]) for i in td.select(td.label, td.indexedLabel).distinct().collect()]),
                key=lambda x: x[0]):
        label2index[int(each[0])] = int(each[1])
    print(label2index)

    featureIndexer = \
        VectorIndexer(inputCol="features", outputCol="indexedFeatures", maxCategories=4).fit(lp_data)

    rf = get_model()

    pipeline = Pipeline(stages=[labelIndexer, featureIndexer, rf])

    lp_train = lp_data.filter(lp_data.date3<end1).filter(lp_data.is_labeled == 1)
    model = pipeline.fit(lp_train)
    lp_check = lp_data.filter(lp_data.date2>start2)
    predictions = model.transform(lp_check)
    predictions = val(predictions, label2index, sql_context)

    if is_pred:
        predictions = predictions.filter(predictions.is_labeled ==0).filter(predictions.date2 == get_cur()).sort(predictions.prob.desc())
        dfToTableWithPar(sql_context, predictions, "predictions", get_cur())
        for each in predictions.take(10):
            print(each)
Example 12
def predict(doc):
    tweet = Row(source=doc['source'], 
                retweet_count=doc['retweet_count'], 
                favorite_count=doc['favorite_count'], is_retweet=doc['is_retweet'],
                sentiment_compound=sentiment_analyze(doc['text'], "compound"),
                sentiment_neg=sentiment_analyze(doc['text'], "neg"),
                sentiment_neu=sentiment_analyze(doc['text'], "neu"),
                sentiment_pos=sentiment_analyze(doc['text'], "pos"),
                hour=util.convertUTCtoHourOfDay(doc['created_at']),
                day=util.convertUTCtoDay(doc['created_at']),
                week=util.convertUTCtoWeekNumber(doc['created_at']),
                month=util.convertUTCtoMonth(doc['created_at']),
                year=util.convertUTCtoYear(doc['created_at'])
            )
    tweet_df = spark.createDataFrame([tweet])
    
    str_indexer = StringIndexer().setInputCol("source").setOutputCol("source_index").fit(tweet_df)
    tweet_df2 = str_indexer.transform(tweet_df)
    tweet_df3 = tweet_df2.select([col(c).cast("double").alias(c) for c in tweet_df2.columns])
    predictions = model.transform(tweet_df3)
    result = predictions.select("prediction").collect()
    
    if len(result)>0:
        return result[0]['prediction']
    
    return None
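Note: predict() fits a new StringIndexer on the single incoming tweet, so source_index always comes out as 0.0 regardless of the source value. The more common pattern is to fit the indexer once on training data and reuse the fitted model at prediction time; a sketch assuming a StringIndexerModel saved at a hypothetical path:

from pyspark.ml.feature import StringIndexerModel

# Reuse an indexer fitted on the training data (path is illustrative).
source_indexer = StringIndexerModel.load("models/source_indexer")
tweet_df2 = source_indexer.transform(tweet_df)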
Example 13
 def parse_data(self, path_ratings, nrows):
     df_ratings = self.sqlContext.read.csv(path_ratings,
                                           header=True,
                                           quote='"').limit(nrows)
     # self.data.count()
     raw_to_uid = StringIndexer(inputCol="user_id",
                                outputCol="UID").fit(df_ratings)
     self.data = raw_to_uid.transform(df_ratings)
     raw_to_iid = StringIndexer(inputCol="business_id",
                                outputCol="IID").fit(df_ratings)
     self.data = raw_to_iid.transform(self.data)
     # uid and iid must be integers for spark ALS
     self.data = self.data.rdd.map(\
                 lambda r: (int(r['UID']), \
                            int(r['IID']), \
                            float(r['stars'])))\
                 .toDF(("UID", "IID", "stars"))
Example 14
def review_ids_to_number(dataframe):
    #build indexer model for user_id
    indexer_user = StringIndexer(inputCol ="user_id",outputCol="user_id_num").fit(dataframe) 
    indexer_user_save = os.path.join('model','user_ind_model')
    indexer_user.write().overwrite().save(indexer_user_save)
    #build indexer model for business_id
    indexer_business = StringIndexer(inputCol ="business_id",outputCol="business_id_num",handleInvalid="skip").fit(dataframe)
    indexer_business_save = os.path.join('model', 'bus_ind_model')
    indexer_business.write().overwrite().save(indexer_business_save)
    # map the string id columns to numeric index columns
    indexed = indexer_user.transform(dataframe)
    final_indexed = indexer_business.transform(indexed)
    final_indexed.show(20)
    # save the indexed dataframe for ALS training
    final_indexed_save = os.path.join('dataset','review_vegas_als.parquet')
    final_indexed.write.mode('overwrite').parquet(final_indexed_save)
    logger.error('Indexed dataframe for ALS training saved to review_vegas_als.parquet')
    logger.error('{} seconds has elapsed'.format(time.time() - start_time))
Example 15
def ProcessData(df):
    df = df.withColumn("label", df["Cancelled"].cast(IntegerType()))
    # categoricalColumns = ['Origin','Dest']

    #Categorical to Continuous/Ordinal/assigning the index
    categoricalColumns = ['Origin', 'Dest']
    for categoricalCol in categoricalColumns:
        stringIndexer = StringIndexer(inputCol=categoricalCol,
                                      outputCol=categoricalCol +
                                      'Index').fit(df)
        df = stringIndexer.transform(df)
    #One Hot Encoder


#        encoder = OneHotEncoderEstimator(inputCols=["OriginIndex", "DestIndex"],
#                                     outputCols=["categoryVec1", "categoryVec2"])
#        model = encoder.fit(df)
#        encoded = model.transform(df)
#        for categoricalCol in categoricalColumns:
#            stringIndexer = StringIndexer(inputCol = categoricalCol, outputCol = categoricalCol + 'Index').fit(df)
#            df=stringIndexer.transform(df)
#
    df = df.withColumn("YearInt", df["Year"].cast(IntegerType()))
    df = df.withColumn("MonthInt", df["Month"].cast(IntegerType()))
    df = df.withColumn("DayofMonthInt", df["DayofMonth"].cast(IntegerType()))
    df = df.withColumn("DayofWeekInt", df["DayOfWeek"].cast(IntegerType()))
    df = df.withColumn("DepTimeInt", df["DepTime"].cast(IntegerType()))
    df = df.withColumn("CRSDepTimeInt", df["CRSDepTime"].cast(IntegerType()))
    df = df.withColumn("ArrTimeInt", df["ArrTime"].cast(IntegerType()))
    df = df.withColumn("CRSArrTimeInt", df["CRSArrTime"].cast(IntegerType()))

    df = df.withColumn("ActualElapsedTimeInt",
                       df["ActualElapsedTime"].cast(IntegerType()))
    df = df.withColumn("CRSElapsedTimeInt",
                       df["CRSElapsedTime"].cast(IntegerType()))
    df = df.withColumn("ArrDelayInt", df["ArrDelay"].cast(IntegerType()))
    df = df.withColumn("DepDelayInt", df["DepDelay"].cast(IntegerType()))
    df = df.withColumn("DistanceInt", df["Distance"].cast(IntegerType()))
    #df= df.withColumn("label", df["Cancelled"].cast(IntegerType()))
    #        encoder = OneHotEncoderEstimator(inputCols=["OriginIndex", "DestIndex"],
    #                                     outputCols=["categoryVec1", "categoryVec2"])
    #        model = encoder.fit(df)
    #        encoded = model.transform(df)
    #
    assembler = VectorAssembler(inputCols=[
        "YearInt", "MonthInt", "DayofMonthInt", "DayofWeekInt", "DepTimeInt",
        "CRSDepTimeInt", "ArrTimeInt", "CRSArrTimeInt", "ActualElapsedTimeInt",
        "CRSElapsedTimeInt", "ArrDelayInt", "DepDelayInt", "OriginIndex",
        "DestIndex", "DistanceInt"
    ],
                                outputCol="features")

    # assembler = VectorAssembler(inputCols=["YearInt","MonthInt","DayofMonthInt","DayofWeekInt","DepTimeInt","CRSDepTimeInt","ActualElapsedTimeInt","CRSElapsedTimeInt","ArrDelayInt","DepDelayInt","OriginIndex","DestIndex","DistanceInt"], outputCol="features")
    df = assembler.transform(df)
    return df
    """============================================================================================================="""
Example 16
def vectorizeData(df, labelsCol, weighClass=False, featsCol=None):
    """Creates dataset from spark DataFrame of mixed categorical and numerical
    features. The function returns only two columns 'label' and 'features'. The 
    input Spark dataframe is 'df'. The column name corresponding to the training 
    labels must be provided in 'labelsCol'."""
    assert labelsCol in df.columns  # 'labelsCol' is not in df.columns
    # Important: fill missing numerical values with 0 and missing categorical values with the string "NONE"
    df = df.fillna(0).fillna("NONE")
    stringColList = [
        i[0] for i in df.dtypes if (i[1] == 'string' and i[0] != labelsCol)
    ]
    # Indexing categorical features (string types)
    indexedCategoricalCols = [
        categoricalCol + "Index" for categoricalCol in stringColList
    ]
    stages = [
        StringIndexer(
            inputCol=categoricalCol,
            outputCol=idx_categoricalCol,
        ) for categoricalCol, idx_categoricalCol in zip(
            stringColList, indexedCategoricalCols)
    ]
    indexer = Pipeline(stages=stages)
    df = indexer.fit(df).transform(df)

    # Assembling indexed and numeric features
    numericColList = [
        i[0] for i in df.dtypes if (i[1] != 'string' and i[0] != labelsCol)
    ]
    assemblerInputs = indexedCategoricalCols + numericColList
    assembler = VectorAssembler(
        inputCols=assemblerInputs,
        outputCol="features" if featsCol is None else featsCol)
    df = assembler.transform(df)
    # Indexing binary labels
    labeller = StringIndexer(inputCol=labelsCol, outputCol="label").fit(df)
    df = labeller.transform(df).select(
        ["features" if featsCol is None else featsCol, "label"])

    if weighClass:
        from sklearn.utils.class_weight import compute_class_weight as weigh
        labels = [int(i.label) for i in df.select('label').collect()]
        wC0, wC1 = list(
            weigh(class_weight='balanced', classes=[0.0, 1.0], y=labels))
        return assemblerInputs, df.withColumn(
            'weight',
            F.when(df.label == 0.0, wC0).otherwise(wC1))
    else:
        return assemblerInputs, df
Example 17
def main(spark, train_data_file, rank_val, reg, alpha_val, user_indexer_model,
         item_indexer_model, model_file):
    '''
    Parameters
    ----------
    spark : SparkSession object
    train_data_file : string, path to the training parquet file to load
    rank_val, reg, alpha_val : ALS hyperparameters (rank, regularization, alpha)
    user_indexer_model, item_indexer_model : strings, paths to store the fitted indexers
    model_file : string, path to store the serialized model file
    '''

    # Load the parquet file
    train = spark.read.parquet(train_data_file)
    #val = spark.read.parquet(val_data_file)

    #transform data
    indexer_user = StringIndexer(inputCol="user_id",
                                 outputCol="user",
                                 handleInvalid="skip").fit(train)
    indexer_item = StringIndexer(inputCol="track_id",
                                 outputCol="item",
                                 handleInvalid="skip").fit(train)
    als = ALS(userCol='user',
              itemCol='item',
              implicitPrefs=True,
              ratingCol='count',
              rank=rank_val,
              regParam=reg,
              alpha=alpha_val)

    pipeline = Pipeline(stages=[indexer_user, indexer_item, als])
    train = indexer_user.transform(train)
    train = indexer_item.transform(train)
    model = als.fit(train)
    indexer_user.save(user_indexer_model)
    indexer_item.save(item_indexer_model)
    model.save(model_file)
Example 18
    def labelIndexing(self, sentimentInfoData):
        labelColm = sentimentInfoData.get(pc.LABELCOLM)
        dataset = sentimentInfoData.get(pc.DATASET)
        indexedLabel = pc.INDEXED_ + labelColm
        #check if the datatype of the col is integer or float or double. if yes then no need to do the indexing.
        '''For now, convert each datatype to string and then index it.'''
        dataset = dataset.withColumn(labelColm, dataset[labelColm].cast(StringType()))
        labelIndexer = StringIndexer(inputCol=labelColm, outputCol=indexedLabel,
                                     handleInvalid="keep").fit(dataset)
        dataset = labelIndexer.transform(dataset)
        #storeLabelIndexer = labelIndexer.write().overwrite().save("") # will update this later
        sentimentInfoData.update({
            pc.INDEXEDCOLM: indexedLabel,
            pc.DATASET: dataset
        })

        return sentimentInfoData
Example 19
def random_forest(df, seed, num_of_trees_list):
    # Drop preferred_foot because it's the only categorical column, the others are all numerical
    # Use preferred_foot if we have time to implement it
    df = df.drop("preferred_foot")

    # Create a new column for the team_position label that is numerical instead of categorical
    labelIndexer = StringIndexer(inputCol="team_position", outputCol="indexed_label").fit(df)
    df = labelIndexer.transform(df)

    list_of_features = df.drop("team_position").drop("indexed_label").columns  # Get list of all features
    assembler = VectorAssembler(inputCols=list_of_features, outputCol="indexed_features")
    df = assembler.transform(df)

    (training_data, testing_data) = df.randomSplit([0.8, 0.2], seed)  # Split the training and testing data

    accuracy_list = []
    cm_list = []  # List of confusion matrices
    for num_of_trees in num_of_trees_list:
        random_forest = RandomForestClassifier(labelCol="indexed_label", featuresCol="indexed_features", impurity="entropy", numTrees=num_of_trees, maxDepth=10)
        model = random_forest.fit(training_data)

        predictions = model.transform(testing_data)

        evaluator = MulticlassClassificationEvaluator(labelCol="indexed_label", predictionCol="prediction",
                                                      metricName="accuracy")
        accuracy = evaluator.evaluate(predictions)
        accuracy_list.append(accuracy)

        y_true = predictions.select(['indexed_label']).collect()
        y_pred = predictions.select(['prediction']).collect()
    
        print("Classification report and confusion matrix for Random Forest with " + str(num_of_trees) + " trees:")
        print(classification_report(y_true, y_pred))
        cm = confusion_matrix(y_true, y_pred)
        confusion_matrix_corrected = [[cm[1][1], cm[1][2], cm[1][0]], [cm[2][1], cm[2][2], cm[2][0]],
                                      [cm[0][1], cm[0][2], cm[0][0]]]
        print("")
        print(confusion_matrix_corrected[0])
        print(confusion_matrix_corrected[1])
        print(confusion_matrix_corrected[2])

        cm_list.append(np.array([confusion_matrix_corrected[0], confusion_matrix_corrected[1], confusion_matrix_corrected[2]]))

    return accuracy_list, cm_list
Example 20
    def add_demo(self):
        import pyspark
        try:
            return self.spark.read.parquet(self.cur_demo_file_name).withColumnRenamed("HADM_ID", "ID")
        except pyspark.sql.utils.AnalysisException as ex:

            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            self.logger.info(message)
            self.logger.info("PROCESS")

            from pyspark.sql.functions import datediff, col, when
            from pyspark.ml.feature import OneHotEncoder, StringIndexer
            from pyspark.ml.feature import VectorAssembler
            cur_demo = self.spark.read.parquet(self.data_dir + "/ADMISSIONS").select("SUBJECT_ID", "HADM_ID", "ADMITTIME", "ADMISSION_TYPE", "ADMISSION_LOCATION", "INSURANCE", "LANGUAGE", "RELIGION", "MARITAL_STATUS", "ETHNICITY")
            cur_pts = self.spark.read.parquet(self.data_dir + "/PATIENTS").select("SUBJECT_ID", "DOB", "GENDER")
            merged_demo = cur_demo.join(cur_pts,"SUBJECT_ID").drop("SUBJECT_ID")
            merged_demo = merged_demo.withColumn("AGE",datediff("ADMITTIME","DOB")/365.0).withColumn("AGE",when(col("AGE")>90,90).otherwise(col("AGE"))).drop("ADMITTIME","DOB").where("AGE > 18").fillna("N/A")

            target_col = merged_demo.columns
            target_col.remove("AGE")
            target_col.remove("HADM_ID")
            target_col.sort()
            self.logger.debug(target_col)
            vector_target = ["AGE"]
            demo_col_list = ["AGE"]
            for cat_col in target_col:
                SI_model= StringIndexer(inputCol=cat_col, outputCol="SI_{0}".format(cat_col)).fit(merged_demo)
                demo_col_list = demo_col_list+[demo_var+"||"+demo_info for demo_var, demo_info in (zip([cat_col]*len(SI_model.labels),SI_model.labels))]
                merged_demo = SI_model.transform(merged_demo)
                merged_demo = OneHotEncoder(inputCol="SI_{0}".format(cat_col),outputCol="OH_{0}".format(cat_col), dropLast=False).transform(merged_demo)
                vector_target.append("OH_{0}".format(cat_col))

            import json
            json.dump({"demo_feature":demo_col_list},open(self.json_demo_feature_dump_loc,"w"))
            sorted(vector_target)
            self.logger.debug( vector_target)
            return_df = VectorAssembler(inputCols=vector_target,outputCol="demo_feature").transform(merged_demo)
            return_df.write.save(self.cur_demo_file_name)
            return_df = self.spark.read.parquet(self.cur_demo_file_name).withColumnRenamed("HADM_ID", "ID").select("ID","demo_feature")
            return return_df
Example 21
def naive_bayes(df, seed):
    # Drop preferred_foot because it's the only categorical column, the others are all numerical
    # Use preferred_foot if we have time to implement it
    df = df.drop("preferred_foot")

    labelIndexer = StringIndexer(inputCol="team_position", outputCol="label").fit(df)
    df = labelIndexer.transform(df)
    df = df.drop("team_position")

    list_of_features = df.drop("label").columns  # Get list of all features
    assembler = VectorAssembler(inputCols=list_of_features, outputCol="features")
    df = assembler.transform(df)

    (train_data, test_data) = df.randomSplit([0.8, 0.2], seed)

    n_bayes = NaiveBayes(smoothing=1.0, modelType="multinomial")

    model = n_bayes.fit(train_data)  # Training happens here

    predictions = model.transform(test_data)

    evaluator = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction",
                                                  metricName="accuracy")
    accuracy = evaluator.evaluate(predictions)

    y_true = predictions.select(['label']).collect()
    y_pred = predictions.select(['prediction']).collect()

    print("Classification report and confusion matrix for Naive Bayes:")
    print(classification_report(y_true, y_pred))
    cm = confusion_matrix(y_true, y_pred)
    confusion_matrix_corrected = [[cm[1][1], cm[1][2], cm[1][0]], [cm[2][1], cm[2][2], cm[2][0]],
                                  [cm[0][1], cm[0][2], cm[0][0]]]
    print("")
    print(confusion_matrix_corrected[0])
    print(confusion_matrix_corrected[1])
    print(confusion_matrix_corrected[2])

    cm = np.array([confusion_matrix_corrected[0], confusion_matrix_corrected[1], confusion_matrix_corrected[2]])

    return accuracy, cm
Example 22
    def stringIndexer(infoData):
        colmToIndex = infoData.get(mc.COLMTOINDEX)
        dataset = infoData.get(mc.DATASET)
        indexedColm = infoData.get(mc.INDEXEDCOLM)
        storageLocation = infoData.get(mc.STORAGELOCATION)
        indexerName = colmToIndex + mc.INDEXER
        file = storageLocation + indexerName
        # check if the datatype of the col is integer or float or double. if yes then no need to do the indexing-- sahil.
        '''For now, convert each datatype to string and then index it.'''
        dataset = dataset.withColumn(colmToIndex, dataset[colmToIndex].cast(StringType()))
        stringIndexer = StringIndexer(inputCol=colmToIndex, outputCol=indexedColm,
                                     handleInvalid="keep").fit(dataset)
        dataset = stringIndexer.transform(dataset)
        stringIndexer.write().overwrite().save(file)  # will update this later
        indexerPathMapping = infoData.get(mc.INDEXERPATHMAPPING)
        indexerPathMapping.update({colmToIndex: file})
        infoData.update({
            mc.INDEXERPATHMAPPING: indexerPathMapping,
            mc.DATASET: dataset
        })

        return infoData
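Note: the function persists each fitted indexer with write().overwrite().save(file). The counterpart when scoring later is to load it back as a StringIndexerModel; a minimal sketch using the path kept in indexerPathMapping:

from pyspark.ml.feature import StringIndexerModel

# Reload a previously saved indexer and apply it to new data.
savedIndexer = StringIndexerModel.load(file)
dataset = savedIndexer.transform(dataset)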
Example 23
def index_nominals(dataframe, renamer=lambda string: u"indexed_%s" % string):
    """Create indexed versions of nominal features in the given dataframe."""

    all_cols = dataframe.columns

    schema = dataframe.schema
    names_by_idx = [str(name) for name in schema.names]
    types_by_idx = [field.dataType for field in schema.fields]
    labels_by_idx = [[] for idx in range(len(all_cols))]

    dataframe_indexing = dataframe

    # The new (or old if not indexed) column names.
    columns = []

    # Fit and apply a sequence of nominal feature indexer.
    for idx, col in enumerate(all_cols):
        if types_by_idx[idx] == StringType():
            # Encode nominal features into doubles.
            indexer = StringIndexer(
                inputCol=col, outputCol=renamer(col)).fit(dataframe_indexing)

            labels_by_idx[idx] = indexer.labels
            dataframe_indexing = indexer.transform(dataframe_indexing)
            columns.append(renamer(col))
        else:
            labels_by_idx[idx] = []
            columns.append(col)

    # Create the object that holds the information necessary to get
    # column and value names for the various converted features and
    # values.
    namer = ValueMapper(columns_by_idx=names_by_idx,
                        types_by_idx=types_by_idx,
                        values_by_idx=labels_by_idx)

    return dataframe_indexing, columns, namer
Example 24
def get_rs(args):

    from pyspark.ml.evaluation import RegressionEvaluator
    from pyspark.ml.recommendation import ALS
    from pyspark.ml.feature import StringIndexer
    from pyspark.sql import SparkSession
    import random
    import string

    def id_generator(size=6, chars=string.ascii_uppercase + string.digits):
       return ''.join(random.choice(chars) for _ in range(size))

    spark = SparkSession.builder.appName('Session_%s' %id_generator()).getOrCreate()
    df_train = spark.read.parquet("./cf_train_subsampled.parquet")
    df_val = spark.read.parquet("./cf_validation_subsampled.parquet")
    df_test = spark.read.parquet("./cf_test_subsampled.parquet")

    # train contains all user, but not all tracks
    user_indexer = StringIndexer(inputCol="user_id", outputCol="user_id_numeric").fit(df_train)
    track_indexer = StringIndexer(inputCol="track_id", outputCol="track_id_numeric").fit(df_train.union(df_val))

    df_train = user_indexer.transform(df_train)
    df_train = track_indexer.transform(df_train)
    df_val = user_indexer.transform(df_val)
    df_val = track_indexer.transform(df_val)
    df_test = user_indexer.transform(df_test)
    df_test = track_indexer.transform(df_test)

    rank,regParam,alpha = args
    als = ALS(rank=rank, maxIter=10, regParam=regParam, alpha=alpha, implicitPrefs = True,
          userCol="user_id_numeric", itemCol="track_id_numeric", ratingCol="count",
                coldStartStrategy="drop")
    #model = als.trainImplicit(df_train)
    model = als.fit(df_train)

    # Evaluate the model by computing the RMSE on the test data
    predictions = model.transform(df_val)
    evaluator = RegressionEvaluator(metricName="rmse", labelCol="count",
                                    predictionCol="prediction")
    rmse = evaluator.evaluate(predictions)
    return [rank,regParam,alpha,rmse]
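Note: get_rs returns [rank, regParam, alpha, rmse], so it can be mapped over a small hyperparameter grid; a hedged usage sketch with illustrative grid values:

# Illustrative grid search; each get_rs call builds its own SparkSession.
param_grid = [(10, 0.1, 1.0), (20, 0.1, 10.0), (50, 1.0, 10.0)]
results = [get_rs(args) for args in param_grid]
best_rank, best_reg, best_alpha, best_rmse = min(results, key=lambda r: r[-1])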
Example 25
                      sep="\t",
                      inferSchema="true",
                      header="false")
test = spark.read.load("hdfs://10.190.2.112/data/test_set.txt",
                       format="csv",
                       sep="\t",
                       inferSchema="true",
                       header="false")

# only for feature transform
total = train.union(val).union(test)

# create features
indexer = StringIndexer(inputCol="_c2", outputCol="c22")
indexer = indexer.fit(total)
train = indexer.transform(train)
val = indexer.transform(val)
test = indexer.transform(test)
# create label
indexer = StringIndexer(inputCol="_c6", outputCol="label")
indexer = indexer.fit(total)
train = indexer.transform(train)
val = indexer.transform(val)
test = indexer.transform(test)
# One-hot encoder
encoder = OneHotEncoder(inputCol="c22", outputCol="c2")
train = encoder.transform(train)
val = encoder.transform(val)
test = encoder.transform(test)

# create the trainer and set its parameters
Example 26
df = df.select('asin','reviewerID','overall')
df.printSchema()
df.show()

#encoding ID's to fit in model
from pyspark.ml.feature import StringIndexer

a = StringIndexer(inputCol="reviewerID", outputCol="reviewerIDIndex",  handleInvalid='skip')
r = a.fit(df)
indexedDf = r.transform(df)
indexedDf.show()

asinIndexer = StringIndexer(inputCol="asin", outputCol="asinIndex",handleInvalid='skip')
a = asinIndexer.fit(df)
indexedDf = a.transform(indexedDf)
indexedDf.show()

from pyspark.sql.types import IntegerType
from pyspark.sql.functions import regexp_replace

indexedDf = indexedDf.withColumn("reviewerID", indexedDf["reviewerIDIndex"].cast(IntegerType()))
indexedDf = indexedDf.withColumn("asin",indexedDf["asinIndex"].cast(IntegerType()))
#indexedDf.show()

#indexedDf.toPandas().to_csv(indexedDf.csv, header=True, index=False)
indexedDf = indexedDf.select('asin','reviewerID','overall')

indexedDf.show()

print(indexedDf.count())
Example 27
# COMMAND ----------

data.show(5)

# COMMAND ----------

from pyspark.ml.feature import StringIndexer

# COMMAND ----------

indexer = StringIndexer(inputCol='Cruise_line',
                        outputCol='cruise_idx').fit(data)

# COMMAND ----------

data = indexer.transform(data)

# COMMAND ----------

data.show(5)

# COMMAND ----------

from pyspark.ml import linalg
from pyspark.ml.feature import VectorAssembler

# COMMAND ----------

vector = VectorAssembler(inputCols=[
    'Age', 'Tonnage', 'passengers', 'length', 'cabins', 'passenger_density',
    'cruise_idx'
Example 28
testing = testing.withColumn("Fare2", testing["Fare"].cast(DoubleType()))
testing = testing.fillna(-1, subset=["Pclass2", "SibSp2", "Parch2", "Fare2"])
testing = testing.fillna(29.67, subset=["Age2"])

# COMMAND ----------

display(training)

# COMMAND ----------

labelIndexer = StringIndexer(inputCol="Survived",
                             outputCol="indexedLabel").fit(training)

# COMMAND ----------

trainingFeatureTest = labelIndexer.transform(training)
display(trainingFeatureTest.select("Survived", "indexedLabel"))

# COMMAND ----------

featureIndexer1 = StringIndexer(inputCol="Sex",
                                outputCol="feature1").fit(training)

# COMMAND ----------

trainingFeatureTest = featureIndexer1.transform(trainingFeatureTest)
display(
    trainingFeatureTest.select("Survived", "indexedLabel", "Sex", "feature1"))

# COMMAND ----------
Example 29
features = ['Price', 'Date of Transfer', 'Property Type', 'Old/New', 'Town/City', 'District', 'County']
data = data.select(features)
# convert all selected string columns into integers
date_indexer = StringIndexer(inputCol='Date of Transfer', outputCol='Date_of_TransferIndexed')
date_indexer = date_indexer.fit(data)
property_type_indexer = StringIndexer(inputCol='Property Type', outputCol='Property_typeIndexed')
property_type_indexer = property_type_indexer.fit(data)
olde_new_indexer = StringIndexer(inputCol='Old/New', outputCol='Old_NewIndexed')
olde_new_indexer = olde_new_indexer.fit(data)
town_indexer = StringIndexer(inputCol='Town/City', outputCol='TownIndexed')
town_indexer = town_indexer.fit(data)
district_indexer = StringIndexer(inputCol='District', outputCol='DistrictIndexed')
district_indexer = district_indexer.fit(data)
county_indexer = StringIndexer(inputCol='County', outputCol='CountyIndexed')
county_indexer = county_indexer.fit(data)
data = date_indexer.transform(data)
data = property_type_indexer.transform(data)
data = olde_new_indexer.transform(data)
data = town_indexer.transform(data)
data = district_indexer.transform(data)
data = county_indexer.transform(data)
data.show()
assembler=VectorAssembler(inputCols=['Date_of_TransferIndexed', 'CountyIndexed'],outputCol='features')
output=assembler.transform(data)
final_data=output.select('features','Price')
train_data,test_data=final_data.randomSplit([0.7,0.3])

lr=LinearRegression(labelCol='Price')
lr_model=lr.fit(train_data)
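Note: the snippet stops after fitting the regression; a hedged sketch of scoring it on the held-out split with RegressionEvaluator:

from pyspark.ml.evaluation import RegressionEvaluator

# RMSE of the fitted model on the 30% held-out split.
predictions = lr_model.transform(test_data)
rmse = RegressionEvaluator(labelCol='Price', predictionCol='prediction',
                           metricName='rmse').evaluate(predictions)
print(rmse)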

# save results
Example 30
# cached join data to reduce processing
cached_join = weather_airline_joined.cache()

# perform train/test split based on year
train_set = filter_to_train(cached_join).cache()
test_set = filter_to_test(cached_join).cache()

# COMMAND ----------

# Index label
labelIndexer = StringIndexer(
    inputCol="dep_del15",
    outputCol="label").setHandleInvalid("keep").fit(train_set)

train_set = labelIndexer.transform(train_set)
test_set = labelIndexer.transform(test_set)

# Index features
categorical = [
    "month", "day_of_week", "op_unique_carrier", "Holiday",
    "PREVIOUS_FLIGHT_DELAYED_FOR_MODELS", "origin_WND_direction_angle",
    "origin_WND_type_code", "origin_CIG_ceiling_visibility_okay",
    "origin_VIS_variability", "dest_WND_direction_angle", "dest_WND_type_code",
    "dest_CIG_ceiling_visibility_okay", "dest_VIS_variability", "crs_dep_hour",
    'distance_group', 'origin_airport_id'
]

categorical_index = [i + "_Index" for i in categorical]

stringIndexer = StringIndexer(
Example 31
train_feature_df = feature_df.filter(feature_df['time'] <= split_time)
test_feature_df = feature_df.filter(feature_df['time'] > split_time)

train_feature_df = train_feature_df.drop('time')
test_feature_df = test_feature_df.drop('time')

assembler = VectorAssembler(
    inputCols=list(set(train_feature_df.columns) - set(['result', 'home_name', 'away_name'])),
    outputCol="features")

train_df = assembler.transform(train_feature_df)
test_df = assembler.transform(test_feature_df)

labelIndexer = StringIndexer(inputCol="result", outputCol="indexedResult").fit(feature_df)

train_df = labelIndexer.transform(train_df)
test_df = labelIndexer.transform(test_df)

label_mapping = dict(enumerate(labelIndexer.labels))
reverse_mapping = {}
for key in label_mapping:
    reverse_mapping[label_mapping[key]] = key


# ## Dimensionality reduction
# 
# Feature selection is not really supported yet in MLlib, so we just applied dimensionality reduction using PCA

# In[509]:

pca = PCA(inputCol="features", outputCol="pca", k=15).fit(train_df)
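Note: the PCA model above is only fit; a short sketch of applying it so the 15-component 'pca' column can replace the raw features downstream:

# Project both splits onto the 15 fitted principal components.
train_df = pca.transform(train_df)
test_df = pca.transform(test_df)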
Example 32
def applyModel(fileName, loadModelName, outlierPercentile = 100):

    sc = SparkContext( 'local', 'pyspark')
    sqlContext = SQLContext(sc)

    #########
    # load data
    #########

    data = sc.textFile(fileName)
    #extract header and remove it
    header = data.first()
    data = data.filter(lambda x:x !=header).cache()
    header = header.split('\t')
    #parse data
    data = data.map(lambda x : x.split('\t'))

    #########
    # prepare features
    #########

    df = sqlContext.createDataFrame(data, header)
    df = (df.withColumn("ADLOADINGTIME",func.regexp_replace('ADLOADINGTIME', 'null', '0').cast('float'))
         .withColumn("TIMESTAMP",func.regexp_replace('TIMESTAMP', 'null', '0').cast('int'))
         .withColumn("GEOIP_LAT",func.regexp_replace('GEOIP_LAT', 'null', '0').cast('int'))
          .withColumn("GEOIP_LNG",func.regexp_replace('GEOIP_LNG', 'null', '0').cast('int'))
          .withColumn("HOSTWINDOWHEIGHT",func.regexp_replace('HOSTWINDOWHEIGHT', 'null', '0').cast('int'))
          .withColumn("HOSTWINDOWWIDTH",func.regexp_replace('HOSTWINDOWWIDTH', 'null', '0').cast('int'))
          .withColumn("TOPMOSTREACHABLEWINDOWHEIGHT",func.regexp_replace('TOPMOSTREACHABLEWINDOWHEIGHT', 'null', '0').cast('int'))
          .withColumn("TOPMOSTREACHABLEWINDOWWIDTH",func.regexp_replace('TOPMOSTREACHABLEWINDOWWIDTH', 'null', '0').cast('int'))
         )
    thr = np.percentile(df.select("ADLOADINGTIME").rdd.collect(), outlierPercentile)
    df = df.filter(func.col('ADLOADINGTIME') < thr)
    df = df.withColumn("TOPMOSTREACHABLEWINDOWAREA", func.col("TOPMOSTREACHABLEWINDOWHEIGHT")*func.col("TOPMOSTREACHABLEWINDOWWIDTH"))
    df = df.withColumn("INTENDENTISACTUALDEVICETYPE", (func.col("ACTUALDEVICETYPE")==func.col("INTENDEDDEVICETYPE")).cast('int'))
    df = df.withColumn("COMBINEDID", 
            func.concat(
                func.col('ACCOUNTID'), 
                func.col('CAMPAIGNID'), 
                func.col('CREATIVEID'), 
                func.col('SDK')) )

    #df = df.withColumn("COMBINEDID", func.regexp_replace("COMBINEDID", '^$', 'NA'))

    df = df.withColumn("COMBINEDEXTERNALID", 
            func.concat( 
                func.regexp_replace('EXTERNALADSERVER', 'null', ''), 
                func.regexp_replace('EXTERNALPLACEMENTID', 'null', ''), 
                func.regexp_replace('EXTERNALSITEID', 'null', ''), 
                func.regexp_replace('EXTERNALSUPPLIERID', 'null', '') ))

    #df = df.withColumn("COMBINEDEXTERNALID", func.regexp_replace("COMBINEDEXTERNALID", '^$', 'NA'))

    df = df.withColumn("PLATFORMCOMBINED", 
            func.concat( 
                func.regexp_replace('PLATFORM', 'null', ''), 
                func.regexp_replace('PLATFORMVERSION', 'null', '') ))

    #df = df.withColumn("PLATFORMCOMBINED", func.regexp_replace("PLATFORMCOMBINED", '^$', 'NA'))

    df = df.withColumn("UA_OSCOMB", 
            func.concat( 
                func.regexp_replace('UA_OS', 'null', ''), 
                func.regexp_replace('UA_OSVERSION', 'null', '') ))

    #df = df.withColumn("UA_OSCOMB", func.regexp_replace("UA_OSCOMB", '^$', 'NA'))
    df = df.withColumn("FILESJSON_SIZE", 
                func.regexp_replace('FILESJSON', '[^,\d]', '') )

    df = df.withColumn("FILESJSON_SIZE", 
                func.regexp_replace('FILESJSON_SIZE', '^,', '') )

    df = df.withColumn("FILESJSON_SIZE", 
                func.regexp_replace('FILESJSON_SIZE', ',,', ',') )

    udf = func.udf(lambda x: int(np.fromstring(x,dtype=int, sep=',').sum()), IntegerType())
    df = df.withColumn("FILESJSON_SIZE", udf("FILESJSON_SIZE"))

    print('Loaded and prepared %d entries' % df.count())

    #########
    # keep only needed features
    #########   

    features = ['ADLOADINGTIME',
     'PLACEMENTID',
     'TIMESTAMP',
     'CREATIVETYPE',
     'UA_HARDWARETYPE',
     'UA_VENDOR',
     'UA_MODEL',
     'UA_BROWSER',
     'UA_BROWSERVERSION',
     'FILESJSON',
     'ERRORSJSON',
     'TOPMOSTREACHABLEWINDOWAREA',
     'FILESJSON_SIZE',
     'COMBINEDID',
     'COMBINEDEXTERNALID',
     'PLATFORMCOMBINED',
     'UA_OSCOMB',
     'SDK',
     'EXTERNALADSERVER'
       ]

    df = df.select(features)

    #########
    # Convert categorical features to numerical
    #########   


    featuresCat = [
     'PLACEMENTID',
     'CREATIVETYPE',
     'UA_HARDWARETYPE',
     'UA_VENDOR',
     'UA_MODEL',
     'UA_BROWSER',
     'UA_BROWSERVERSION',
     'FILESJSON',
     'ERRORSJSON',
     'COMBINEDID',
     'COMBINEDEXTERNALID',
     'PLATFORMCOMBINED',
     'UA_OSCOMB',
     'SDK',
     'EXTERNALADSERVER'
       ]

    for i in range(len(featuresCat)):

        indexer = StringIndexer(inputCol=featuresCat[i], outputCol='_'+featuresCat[i]).setHandleInvalid("skip").fit(df)
        df = indexer.transform(df).drop(featuresCat[i])
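        # NOTE: _call_java("write") reaches into the private Java writer; recent
        # PySpark versions expose the same persistence publicly via
        # indexer.write().overwrite().save(path) (assumption, not verified here).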
        writer = indexer._call_java("write")
        writer.overwrite().save("indexer_" + featuresCat[i])    

    featuresCat = [ '_' + featuresCat[i] for i in range(len(featuresCat))]    

    features = featuresCat[:]
    features.append('TIMESTAMP')    
    features.append('FILESJSON_SIZE')
    features.append('TOPMOSTREACHABLEWINDOWAREA')


    #########
    # Assemble features
    #########   


    assembler = VectorAssembler(
        inputCols=features,
        outputCol="features")

    df = assembler.transform(df)

    #########
    # Convert to labeled point
    #########   


    lp = (df.select(func.col("ADLOADINGTIME").alias("label"), func.col("features"))
      .rdd.map(lambda row: LabeledPoint(row.label, row.features)))
    lp.cache()


    #########
    # Load trained model
    #########
    
    model = RandomForestModel.load(sc, loadModelName)
    
    print('Model loaded!')
    
    predictions = model.predict(lp.map(lambda x: x.features)).collect()
    
    return predictions