Example #1

import elizabeth
from pyspark.ml.classification import NaiveBayes
from pyspark.ml.feature import (CountVectorizer, IDF, IndexToString, NGram,
                                StringIndexer)
from pyspark.sql.functions import avg

def main(train_x,
         train_y,
         test_x,
         test_y=None,
         idf=False,
         ngram=1,
         base='gs',
         asm=False):
    # Load : DF[id, url, features, label?]
    # The DataFrames only have a labels column if labels are given.
    # Drop the text column, since Naive Bayes doesn't use it and we already have the tokens.
    kind = 'asm' if asm else 'bytes'
    train = elizabeth.load(train_x, train_y, base=base, kind=kind).drop('text')
    test = elizabeth.load(test_x, test_y, base=base, kind=kind).drop('text')

    # Convert the string labels to numeric indices.
    # handleInvalid='skip' makes the indexer drop rows whose labels weren't seen during fitting.
    label_indexer = StringIndexer(inputCol='label',
                                  outputCol='indexedLabel',
                                  handleInvalid="skip")
    label_indexer = label_indexer.fit(train)
    train = label_indexer.transform(train)
    # the test set won't always have labels
    if test_y is not None:
        test = label_indexer.transform(test)

    index_labeller = IndexToString(inputCol='prediction',
                                   outputCol='predictedClass',
                                   labels=label_indexer.labels)

    # Train the preprocessor and transform the data.
    prep = elizabeth.Preprocessor()
    prep.add(NGram(n=int(ngram)))
    prep.add(CountVectorizer())
    if idf:
        prep.add(IDF())
    train = prep.fit(train)
    test = prep.transform(test)

    # Naive Bayes : DF[id, url, features, label?, rawPrediction, probability, prediction]
    nb = NaiveBayes(labelCol='indexedLabel').fit(train)
    test = nb.transform(test)
    # DF[id, url, ..., prediction, predictedClass]
    test = index_labeller.transform(test)

    # If labels are given for the test set, print an accuracy score.
    if test_y is not None:
        test = test.orderBy(test.id)
        test = test.withColumn(
            'correct', (test.label == test.predictedClass).cast('double'))
        test = test.select(avg(test.correct))
        test.show()  # show() prints the DataFrame and returns None, so don't wrap it in print()

    # If no labels are given for the test set, print predictions.
    else:
        test = test.orderBy(test.id).select(test.predictedClass)
        test = test.rdd.map(lambda row: int(row.predictedClass))
        test = test.toLocalIterator()
        print(*test, sep='\n')
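
As a standalone illustration of the label round trip in `main` above, here is a minimal sketch with a toy DataFrame (the data and the `spark` session variable are hypothetical):

from pyspark.ml.feature import IndexToString, StringIndexer

# StringIndexer maps string labels to double indices; IndexToString maps back.
toy = spark.createDataFrame([('a', 'cat'), ('b', 'dog'), ('c', 'cat')],
                            ['id', 'label'])
indexer = StringIndexer(inputCol='label', outputCol='indexedLabel',
                        handleInvalid='skip').fit(toy)
indexed = indexer.transform(toy)  # adds indexedLabel: 0.0 for 'cat', 1.0 for 'dog'
back = IndexToString(inputCol='indexedLabel', outputCol='labelAgain',
                     labels=indexer.labels).transform(indexed)
back.show()  # labelAgain matches the original label column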
Example #2
from pyspark.ml.classification import NaiveBayes

model = NaiveBayes()
model = model.fit(train_data)

# # model evaluation

# In[ ]:

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# In[ ]:

acc_eval = MulticlassClassificationEvaluator()

# In[ ]:

test_results = model.transform(test_data)

# In[ ]:

# Keep only rows whose predicted label index is non-zero.
test_results = test_results.filter(test_results['prediction'] > 0)

# In[ ]:

test_results.count()

# In[ ]:

# The evaluator's default metric is F1.
print('F1:', acc_eval.evaluate(test_results))
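
The same evaluator can report other scores; `metricName` selects which one. A small sketch of a hypothetical follow-up cell:

# Evaluate a few additional multiclass metrics on the same predictions.
for metric in ('accuracy', 'weightedPrecision', 'weightedRecall'):
    ev = MulticlassClassificationEvaluator(metricName=metric)
    print(metric, ev.evaluate(test_results))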

Example #3
import datetime

from pyspark.ml.classification import NaiveBayes

train = splits[0]
test = splits[1]

# Create the Naive Bayes model, train it, and run the prediction
now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

nb = NaiveBayes(labelCol='attack_cat_index',
                featuresCol='features',
                predictionCol='prediction')
nb = nb.fit(train)

now = datetime.datetime.now()
print(now.year, now.month, now.day, now.hour, now.minute, now.second)

result = nb.transform(test)

# Build a list of (true label, prediction) pairs and a helper for the TPR (true positive rate).
prediction_list = result.select("attack_cat_index",
                                "prediction").toPandas().values.tolist()


def truePositiveRate(pairs, label):
    # Fraction of rows with true label `label` that were also predicted as `label`.
    tot_count = 0
    true_count = 0
    for true_label, predicted in pairs:
        if true_label == label:
            tot_count += 1
            if predicted == label:
                true_count += 1
    return true_count / tot_count if tot_count else 0.0
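
A usage sketch for the helper (assuming the `prediction_list` built above): print the TPR for every class that appears in the test labels.

for lbl in sorted({pair[0] for pair in prediction_list}):
    print(lbl, truePositiveRate(prediction_list, lbl))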
Example #4
from pyspark.ml.classification import LogisticRegression, NaiveBayes
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.feature import StringIndexer, VectorAssembler

assemblerInputs = indexedCategoricalCols + numericColList
assembler = VectorAssembler(inputCols=assemblerInputs, outputCol="features")
df = assembler.transform(df)

# Indexing binary labels
labeller = StringIndexer(inputCol=label, outputCol="label").fit(df)
df = labeller.transform(df).select(["features", "label"])

# Randomly split data into training and test sets; set the seed for reproducibility.
(trainingData, testData) = df.randomSplit([0.7, 0.3], seed=100)

# dt = DecisionTreeClassifier(labelCol="label", featuresCol="features")
lr = LogisticRegression(regParam=0.01)
model = lr.fit(trainingData)

# Make predictions.
predictions = model.transform(testData)
# Evaluator is abstract, so instantiate a concrete evaluator (default metric: F1).
evaluator = MulticlassClassificationEvaluator()
# Select example rows to display.
predictions.select("prediction", "label", "features").show()
# Evaluate the learned model
print("LogRegression Test %s: %f" %
      (evaluator.getMetricName(), evaluator.evaluate(predictions)))

nb = NaiveBayes(thresholds=[0.1, 1.0])
model = nb.fit(trainingData)
predictions = model.transform(testData)
predictions.select("prediction", "label", "features").show()

print("Bayes Test %s: %f" %
      (evaluator.getMetricName(), evaluator.evaluate(predictions)))
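
As a design note, Spark's Pipeline API can chain the assembler, label indexer, and classifier so a single fit() runs every stage in order. A minimal sketch, assuming `raw_df` (a hypothetical name) holds the DataFrame before the transforms above:

from pyspark.ml import Pipeline

# Chain feature assembly, label indexing, and the classifier into one estimator.
stages = [
    VectorAssembler(inputCols=assemblerInputs, outputCol="features"),
    StringIndexer(inputCol=label, outputCol="label"),
    LogisticRegression(regParam=0.01),
]
pipeline_model = Pipeline(stages=stages).fit(raw_df)
predictions = pipeline_model.transform(raw_df)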