Beispiel #1
0
def extract_features(pdfs_ben, pdfs_mal, csv_name):
    """
    Extract feature vectors for two sets of PDFs and write them to a CSV.

    Every entry of pdfs_mal and then of pdfs_ben is handed to get_features
    in a pool of worker processes. get_features is expected to return a
    (pdf, feats) pair; entries whose feats is None are skipped. Malicious
    samples get the label 1.0 and benign samples the label 0.0, and the
    malicious rows come first in the output.

    pdfs_ben -- iterable of benign inputs (presumably PDF file paths)
    pdfs_mal -- iterable of malicious inputs (presumably PDF file paths)
    csv_name -- destination passed through to datasets.numpy2csv
    """
    feat_vecs = []
    labels = []
    file_names = []
    # Extract malicious and benign features
    pool = multiprocessing.Pool()
    try:
        for label, pdfs in ((1.0, pdfs_mal), (0.0, pdfs_ben)):
            for pdf, feats in pool.imap(get_features, pdfs):
                if feats is not None:
                    feat_vecs.append(feats)
                    labels.append(label)
                    file_names.append(pdf)
    finally:
        # Either all results were consumed or an error occurred; in both
        # cases stop the workers so they do not outlive this call.
        pool.terminate()
        pool.join()

    # Convert the data points into a numpy.array, one row per sample
    X = numpy.zeros(
        (len(feat_vecs), featureedit.FeatureDescriptor.get_feature_count()),
        dtype=numpy.float64,
        order='C')
    for i, v in enumerate(feat_vecs):
        X[i, :] = v
    # Write the resulting CSV file
    datasets.numpy2csv(csv_name, X, labels, file_names)
def extract_features(pdfs_ben, pdfs_mal, csv_name):
    """
    Extract feature vectors for two sets of PDFs and write them to a CSV.

    Every entry of pdfs_mal and then of pdfs_ben is handed to get_features
    in a pool of worker processes. get_features is expected to return a
    (pdf, feats) pair; entries whose feats is None are skipped. Malicious
    samples get the label 1.0 and benign samples the label 0.0, and the
    malicious rows come first in the output.

    pdfs_ben -- iterable of benign inputs (presumably PDF file paths)
    pdfs_mal -- iterable of malicious inputs (presumably PDF file paths)
    csv_name -- destination passed through to datasets.numpy2csv
    """
    feat_vecs = []
    labels = []
    file_names = []
    # Extract malicious and benign features
    pool = multiprocessing.Pool()
    try:
        for label, pdfs in ((1.0, pdfs_mal), (0.0, pdfs_ben)):
            for pdf, feats in pool.imap(get_features, pdfs):
                if feats is not None:
                    feat_vecs.append(feats)
                    labels.append(label)
                    file_names.append(pdf)
    finally:
        # Either all results were consumed or an error occurred; in both
        # cases stop the workers so they do not outlive this call.
        pool.terminate()
        pool.join()

    # Convert the data points into a numpy.array, one row per sample
    X = numpy.zeros(
        (len(feat_vecs), featureedit.FeatureDescriptor.get_feature_count()),
        dtype=numpy.float64,
        order='C')
    for i, v in enumerate(feat_vecs):
        X[i, :] = v
    # Write the resulting CSV file
    datasets.numpy2csv(csv_name, X, labels, file_names)
Beispiel #3
0
 def fit(self, X, y):
     '''
     Fit a fresh random forest model in R on samples X with labels y.

     The data is dumped to a temporary CSV file with datasets.numpy2csv,
     loaded into the R variable named by self.traindata_Rname, and used
     to train a randomForest model stored in the R variable named by
     self.model_Rname. All R interaction happens while holding _R_lock.
     Afterwards self.model_trained is set to True.
     '''
     with _R_lock, tempfile.NamedTemporaryFile() as csv_file:
         datasets.numpy2csv(csv_file, X, y)
         csv_file.seek(0)
         # Load the training samples into R; the column classes come from
         # _r_colClasses (presumably this drops the filename column -- confirm).
         load_cmd = '{train} <- read.csv("{csv}", header=TRUE, colClasses={cc})'.format(
             train=self.traindata_Rname,
             csv=csv_file.name,
             cc=_r_colClasses)
         robjects.r(load_cmd)
         # Grow 1000 trees with mtry=43; the first CSV column is the
         # response and every remaining column is a predictor.
         train_cmd = '{model} <- randomForest(x={train}[,-1], y={train}[,1], ntree=1000, mtry=43, importance=TRUE)'.format(
             model=self.model_Rname,
             train=self.traindata_Rname)
         robjects.r(train_cmd)
     self.model_trained = True
Beispiel #4
0
 def decision_function(self, X):
     '''
     Classifies novel data points using a trained model. Returns a
     numpy array of shape (X.shape[0], 1) holding, for each data point,
     the predicted probability of it belonging to the positive class.

     X -- 2-D numpy array with one row per data point (only X.shape[0]
          is read here; the rows are serialised by datasets.numpy2csv)

     Raises AssertionError if no model has been trained yet.
     '''
     assert self.model_trained, 'Must train a model before classification'
     with _R_lock:
         with tempfile.NamedTemporaryFile() as tmpfile:
             # The label column is required by the CSV layout but unused
             # for prediction, so it is filled with zeros.
             datasets.numpy2csv(tmpfile, X, numpy.zeros((X.shape[0],)))
             tmpfile.seek(0)
             # Read in the CSV file with the samples to be classified; the
             # column classes come from _r_colClasses (presumably this drops
             # the filename column -- confirm).
             robjects.r('{novel} <- read.csv("{csv}", header=TRUE, colClasses={cc})'.format(novel=self.noveldata_Rname, csv=tmpfile.name, cc=_r_colClasses))
             # Classify the new data points
             robjects.r('{pred} <- predict({model}, {novel}, type="prob")'.format(pred=self.predictions_Rname, model=self.model_Rname, novel=self.noveldata_Rname))
             predictions = list(robjects.r['{pred}'.format(pred=self.predictions_Rname)])
     # The flattened prediction matrix lists the negative-class column
     # first, so drop the first half and keep the positive-class half.
     # Floor division keeps the slice index an int on Python 3 as well.
     predictions = predictions[len(predictions) // 2:]
     res = numpy.zeros((X.shape[0], 1))
     for i, r in zip(range(X.shape[0]), predictions):
         res[i] = r
     return res