예제 #1
0
 def run(self):
     """Run the full data-set build: ingest, label, save and optionally scale.

     Steps, in order:
       1. ``initialize()`` and reset ``self.dfs``.
       2. Call ``ingestLogs()`` when ``self.logPaths`` is ``None``.
       3. Check that a ``_<context>`` attribute exists for every feature
          and label key; ``Msg.abort`` is called for each missing one.
       4. Build one frame per feature and per label through those handlers.
       5. Convert each label's mapped column to a severity with the CVSS
          v2 or v3 helper, depending on ``self.cvssVersion``.
       6. Save every feature, call ``saveData()`` and, when
          ``self.scaleDataFlag`` is truthy, ``scaleData()``.

     Returns:
         Always 0.
     """
     self.initialize()
     self.dfs = {}
     if self.logPaths is None:
         self.ingestLogs()

     # Validate handlers for all contexts before any ingestion starts.
     contexts = list(self.features.keys())
     contexts.extend(self.labels.keys())
     for context in contexts:
         handlerName = "_{0}".format(context)
         if handlerName not in dir(self):
             Msg.abort(
                 "Missing expected class method handler for {0}".format(
                     context))

     for feature in self.features:
         handler = getattr(self, "_{0}".format(feature))
         self.dfs[feature] = handler(self.logPaths["features"][feature])

     for label in self.labels:
         handler = getattr(self, "_{0}".format(label))
         labelDf = handler(self.logPaths["labels"][label])
         self.dfs[label] = labelDf
         labelName = self.labelMap[label]
         # Only version 2 selects the V2 helper; anything else uses V3.
         toSeverity = (
             (lambda score: Cvss.getSeverityV2(score))
             if self.cvssVersion == 2
             else (lambda score: Cvss.getSeverityV3(score)))
         labelDf[labelName] = labelDf[labelName].apply(toSeverity)

     for feature in self.features:
         self.saveDataFeature(feature, self.dfs[feature])
     self.saveData()
     if self.scaleDataFlag:
         self.scaleData()
     return 0
예제 #2
0
 def run(self):
     """Export the scores CSV to an Excel workbook with data, chart and summary.

     Reads ``self.inputDataPath`` into ``self.df`` and writes a workbook to
     ``self.outputDataPath`` with three sheets:

     * ``Data``    - a running row counter ``K`` followed by every CSV
       column (sorted by name), one sheet row per CSV row.
     * ``Chart``   - a scatter chart with one series per data column,
       plotted against ``K``. The series ranges follow the rows and
       columns actually written; the chart is skipped when there is
       nothing to plot.
     * ``Summary`` - mean, median, std, var, min and max per column.

     The frame is passed to ``Msg.raw`` once the workbook is closed.
     """
     self.df = pd.read_csv(self.inputDataPath)
     wb = xlsxwriter.Workbook(self.outputDataPath)
     fmtBold = wb.add_format({'bold': True})
     wsSummary = wb.add_worksheet("Summary")
     wsData = wb.add_worksheet("Data")
     wsChart = wb.add_worksheet("Chart")
     wsSummary.set_column("A:I", 18)
     wsData.set_column("A:I", 18)

     # presumably one column per classifier; verify against the CSV writer
     classifiers = sorted(self.df.columns)

     # Data sheet: header row, then one row per record. Column 0 holds the
     # 1-based row counter, classifier columns start at sheet column 1.
     wsData.write(0, 0, "K", fmtBold)
     for col, classifier in enumerate(classifiers, start=1):
         wsData.write(0, col, classifier, fmtBold)
     firstRow = 1
     nextRow = firstRow
     for _, record in self.df.iterrows():
         wsData.write(nextRow, 0, nextRow)
         for col, classifier in enumerate(classifiers, start=1):
             wsData.write(nextRow, col, record[classifier])
         nextRow += 1
     lastRow = nextRow - 1

     # Chart sheet: ranges use XlsxWriter's
     # [sheet, first_row, first_col, last_row, last_col] list form so they
     # track the real extent of the data instead of a fixed B2:D101 window.
     if classifiers and lastRow >= firstRow:
         chart = wb.add_chart({"type": "scatter"})
         categories = ["Data", firstRow, 0, lastRow, 0]
         for col in range(1, len(classifiers) + 1):
             chart.add_series({
                 "name": ["Data", 0, col],
                 "categories": categories,
                 "values": ["Data", firstRow, col, lastRow, col],
             })
         # Axis option keys are lowercase in XlsxWriter.
         chart.set_x_axis({"name": "KFold"})
         chart.set_y_axis({"name": "Score"})
         wsChart.insert_chart("A1", chart)

     # Summary sheet: one header row, then one row of statistics per column.
     headers = ["Classifier", "Mean", "Median", "Std", "Var", "Min", "Max"]
     for col, header in enumerate(headers):
         wsSummary.write(0, col, header, fmtBold)
     for row, classifier in enumerate(classifiers, start=1):
         scores = self.df[classifier]
         summary = [
             Number.asFloat(np.mean(scores), 3),
             Number.asFloat(np.median(scores), 3),
             Number.asFloat(np.std(scores), 3),
             Number.asFloat(np.var(scores), 3),
             np.min(scores),
             np.max(scores),
         ]
         wsSummary.write(row, 0, classifier, fmtBold)
         for col, value in enumerate(summary, start=1):
             wsSummary.write(row, col, value)

     wb.close()
     Msg.raw(self.df)
예제 #3
0
 def scaleData(self):
     """Rescale the feature columns of every saved CSV and rewrite it in place.

     For each path in ``self.paths`` the file is read back, column 0 is
     treated as the label, columns 1-2 as the join keys and every remaining
     column as a feature. Only the feature columns are passed through a
     ``RobustScaler`` (re-fitted for each file); label and join columns are
     copied unchanged. The result overwrites the same path with floats
     formatted to two decimals.
     """
     scaler = preprocessing.RobustScaler()
     for path in self.paths:
         dfData = pd.read_csv(path)
         columnNames = dfData.columns.values
         # The label is looked up by its header exactly as read from the
         # file and written back under the stripped name; indexing with the
         # stripped name would raise KeyError whenever stripping changed it.
         rawLabelName = columnNames[0]
         labelName = rawLabelName.strip()
         joinNames = columnNames[1:3].tolist()
         featureNames = columnNames[3:].tolist()
         df = pd.DataFrame(columns=[labelName] + joinNames + featureNames)
         df[labelName] = dfData[rawLabelName].values
         df[joinNames] = dfData[joinNames].values
         df[featureNames] = scaler.fit_transform(dfData[featureNames])
         Msg.show("Rescaling {0}".format(path))
         df.to_csv(path, index=False, float_format='%.2f')
예제 #4
0
 def build(self, inputDataPath):
     """Cross-validate every configured classifier on one input file.

     Calls ``ingestData(inputDataPath)``, then reads ``self.dfFeatures`` and
     ``self.dfLabel`` and scores each entry of ``self.classifiers`` (visited
     in name order) with ``cross_val_score`` over a ``KFold`` of
     ``self.kFolds`` splits, shuffled according to
     ``self.randomizeDataFlag``.

     Args:
         inputDataPath: Path handed to ``ingestData``.

     Returns:
         dict mapping classifier name to the list of its fold scores.
     """
     Msg.raw("Analyzing <-> {0}".format(inputDataPath))
     self.ingestData(inputDataPath)
     folds = KFold(n_splits=self.kFolds,
                   shuffle=self.randomizeDataFlag,
                   random_state=None)
     samples = self.dfFeatures.values
     # Labels are cast to int64 before being handed to the classifiers.
     targets = self.dfLabel.values.astype(dtype="int64")
     results = {}
     for name, classifier in sorted(self.classifiers.items()):
         foldScores = cross_val_score(classifier, samples, targets, cv=folds)
         results.setdefault(name, []).extend(foldScores)
     return results
예제 #5
0
 def saveDataFeature(self, featureName, featureDf):
     """Join one feature frame with every label frame and write each to CSV.

     The feature frame's columns are first replaced with the result of
     ``_prepareColumns`` (this renames ``featureDf`` in place). For each
     label in ``self.labels`` a deep copy of ``self.dfs[label]`` gets the
     same treatment, is inner-joined with the feature frame on
     ``sessionId`` / ``projectName`` and saved as
     ``<outputDir>/<label>-<feature>.csv`` with two-decimal floats. Every
     written path is appended to ``self.paths``.

     Args:
         featureName: Key of the feature; also used to look up its alias.
         featureDf: Frame holding the feature data (columns are renamed).
     """
     keys = ["sessionId", "projectName"]
     featureDf.columns = self._prepareColumns(
         featureDf.columns.values.tolist(), self.aliases[featureName], keys)

     for labelName in self.labels:
         target = "{0}/{1}-{2}.csv".format(
             self.outputDir, labelName, featureName)
         # Work on a copy so the stored label frame keeps its own columns.
         labelDf = self.dfs[labelName].copy(deep=True)
         labelDf.columns = self._prepareColumns(
             labelDf.columns.values.tolist(), self.aliases[labelName], keys)
         merged = labelDf.merge(featureDf, how="inner", on=keys)
         Msg.show("Saving {0}/{1} -> {2}".format(
             labelName, featureName, target))
         self.paths.append(target)
         merged.to_csv(target, index=False, float_format='%.2f')
예제 #6
0
 def initialize(self):
     """Load the JSON configuration and prepare state for a run.

     Reports the CVSS version and scaling flag, reads ``self.cfgPath`` as
     JSON and stores its ``features``, ``labels`` and ``aliases`` entries on
     the instance. ``self.labelMap`` maps each label key to the first item
     of that label's configured value. ``Dir.make`` is called with
     ``self.outputDir``, ``self.paths`` is reset to an empty list, and the
     missing-data policy in effect is reported.
     """
     Msg.show("Initializing")
     Msg.show("CVSS scoring version: {0}".format(self.cvssVersion))
     Msg.show("Scale data: {0}".format(self.scaleDataFlag))

     config = File.read(path=self.cfgPath, asJsonFlag=True)
     self.features = config["features"]
     self.labels = config["labels"]
     self.aliases = config["aliases"]
     self.labelMap = {
         label: spec[0] for label, spec in self.labels.items()}

     Dir.make(self.outputDir)
     self.paths = []

     if not self.excludeMissingDataFlag:
         Msg.show("Replacing missing data using '{0}' method".format(
             self.replaceMissingDataMethod))
     else:
         Msg.show("Excluding rows with missing data")
예제 #7
0
 def ingestLogData(self, logPaths, name, attributes):
     """Read JSON logs into a DataFrame, one row per log file.

     Args:
         logPaths: Iterable of log file paths, each read as JSON.
         name: Context name; it must be a key of ``self.aliases``.
         attributes: List of keys to extract from every log. ``sessionId``
             and ``projectName`` are inserted at the front when absent.
             NOTE: the caller's list is modified in place.

     Returns:
         DataFrame built from the extracted values. Rows with missing data
         are dropped when ``self.excludeMissingDataFlag`` is set; otherwise
         gaps are filled with the per-column median (when
         ``self.replaceMissingDataMethod == "median"``) or mean of the
         numeric columns.

     Raises:
         KeyError: if ``name`` has no entry in ``self.aliases``.
     """
     Msg.show("Ingesting {0} logs".format(name))
     # Reject unknown names before any file is read.
     if name not in self.aliases:
         raise KeyError(name)

     # Insertion order leaves sessionId first, then projectName.
     for joinKey in ("projectName", "sessionId"):
         if joinKey not in attributes:
             attributes.insert(0, joinKey)

     samples = []
     for logPath in logPaths:
         results = File.read(path=logPath, asJsonFlag=True)
         # presumably None is the value used for absent keys; verify against
         # Data.getDictByKeys
         samples.append(Data.getDictByKeys(results, attributes, None))
     df = pd.DataFrame(samples)

     if self.excludeMissingDataFlag:
         df.dropna(inplace=True)
     elif self.replaceMissingDataMethod == "median":
         # numeric_only=True: text columns would otherwise make the
         # reduction raise TypeError on current pandas releases.
         df.fillna(df.median(numeric_only=True), inplace=True)
     else:
         df.fillna(df.mean(numeric_only=True), inplace=True)
     return df
예제 #8
0
 def run(self):
     """Repeat the cross-validation build and save all collected scores.

     After ``initialize()``, ``build(self.inputDataPath)`` is called
     ``self.cycles`` times and the scores it returns are appended to a
     per-classifier list. The collected scores are turned into a DataFrame,
     passed to ``Msg.raw`` and written to ``self.outputDataPath`` as CSV
     with two-decimal floats.
     """
     self.initialize()
     # NOTE(review): a single [] literal is passed as the default value;
     # confirm Data.buildDictByKeys gives every key its own list.
     collected = Data.buildDictByKeys(self.classifiers.keys(), [])
     for cycle in range(1, self.cycles + 1):
         Msg.show("Cycle {0} of {1}".format(cycle, self.cycles))
         cycleScores = self.build(self.inputDataPath)
         for classifier, values in cycleScores.items():
             collected[classifier].extend(values)
     results = pd.DataFrame.from_dict(collected)
     Msg.raw(results)
     results.to_csv(self.outputDataPath, index=False, float_format='%.2f')
     Msg.show("Saved results -> {0}".format(self.outputDataPath))
예제 #9
0
 def initialize(self):
     """Report start-up and pass the output file's directory to ``Dir.make``."""
     Msg.show("Initializing")
     outputDir = File.getDirectory(self.outputDataPath)
     Dir.make(outputDir)