class PredictionPipeline():
    """Endless loop that fetches data, predicts, applies and reports the result.

    Settings are read from ``./config/predictionconfig.yml``:
    ``interval`` is the sleep time between two runs (handed to
    ``time.sleep``) and ``single_threshold`` is handed to
    ``Prediction.apply``. The push gateway host is taken from the
    ``PUSHGATEWAY_URL`` environment variable.
    """

    def __init__(self):
        """Load the configuration and create the collaborating objects."""
        # load config file
        with open("./config/predictionconfig.yml", "r") as ymlfile:
            cfg = yaml.load(ymlfile, Loader=yaml.FullLoader)
        self.interval = cfg['interval']
        self.threshold = cfg['single_threshold']

        # init DataPreprocessor
        self.data_preprocessor = DataPreprocessor()
        # init PredictionMaker
        self.prediction_maker = PredictionMaker()

        self.registry = CollectorRegistry()
        self.pushgateway_url = os.getenv('PUSHGATEWAY_URL')
        # Created lazily on the first measurement; a metric name may only be
        # registered once per registry, so the gauge is cached here.
        self._speed_gauge = None

    def _push_prediction_speed(self, prediction_millis):
        """Set the timing gauge to ``prediction_millis`` and push the registry."""
        if getattr(self, '_speed_gauge', None) is None:
            self._speed_gauge = Gauge('prediction_making_speed',
                                      'Time in ms for making Prediction.',
                                      registry=self.registry)
        self._speed_gauge.set(prediction_millis)
        push_to_gateway('{}:9091'.format(self.pushgateway_url),
                        job='prediction-maker',
                        registry=self.registry)

    def run(self):
        """Run the pipeline forever, sleeping ``self.interval`` between runs.

        Each iteration loads and preprocesses the data. If the resulting
        frame is not empty, a prediction is made, applied with
        ``self.threshold``, pushed to the gateway, and the time spent is
        reported through the ``prediction_making_speed`` gauge. This method
        never returns on its own.
        """
        while True:
            start_millis = int(round(time.time() * 1000))
            print("Starting pipeline...")

            # get data
            df = self.data_preprocessor.get_data()
            df = self.data_preprocessor.preprocess_data(df)

            if not df.empty:
                # predict
                result = self.prediction_maker.make_prediction(df)
                end_millis = int(round(time.time() * 1000))
                prediction_millis = end_millis - start_millis

                prediction = Prediction(result)
                # apply changes to K8s Cluster
                prediction.apply(self.threshold)
                # push to prometheus gateway
                prediction.push_to_prometheus(self.registry,
                                              self.pushgateway_url)
                self._push_prediction_speed(prediction_millis)

                # The duration only exists when a prediction was made.
                print("Prediction took {} ms.".format(prediction_millis))

            # sleep until next interval, also when there was no data, so an
            # empty result never turns into a busy loop
            print("Going back to sleep for {} sec...".format(self.interval))
            time.sleep(self.interval)
class TestDataPreprocessor(unittest.TestCase):
    """Unit tests covering DataPreprocessor.preprocess_data."""

    def setUp(self):
        """Create a fresh DataPreprocessor for every test case."""
        self.data_preprocessor = DataPreprocessor()

    def test_preprocess_empty_data(self):
        """An empty input frame is compared against an empty expected frame."""
        raw_columns = ['traceid', 'sessionid', 'servicessequence', 'starttime']
        actual = self.data_preprocessor.preprocess_data(
            pd.DataFrame(columns=raw_columns))

        expected_columns = ['sessionid', 'nextcluster', 'starttime']
        expected = pd.DataFrame(columns=expected_columns)

        assert_frame_equal(actual, expected)

    def test_preprocess_data(self):
        """A single regular trace row is compared against the expected row."""
        raw_columns = ['traceid', 'sessionid', 'servicessequence', 'starttime']
        raw_rows = [['1234', '1234', 'service-1,service-2,service-1', '1234']]
        actual = self.data_preprocessor.preprocess_data(
            pd.DataFrame(raw_rows, columns=raw_columns))

        expected_columns = [
            'index', 'traceid', 'sessionid', 'servicessequence', 'starttime',
            'currentclusternumber', 'clustersequence', 'nextcluster'
        ]
        expected_rows = [
            [0.0, '1234', '1234', 'service-1,service-2', '1234', 0, 0, 0]
        ]
        expected = pd.DataFrame(expected_rows, columns=expected_columns)

        # Column order is irrelevant here, so both frames are sorted by
        # column label before the comparison.
        actual_sorted = actual.sort_index(axis=1)
        expected_sorted = expected.sort_index(axis=1)
        assert_frame_equal(actual_sorted,
                           expected_sorted,
                           check_dtype=False,
                           check_index_type=False)
class MechansimCreator():
    '''
    Class responsible for creating the mechanism for the prediction
    '''

    def __init__(self, algorithm):
        '''
        init data preprocessor and classifier

        algorithm: name of the classifier, compared case-insensitively;
        'decisiontree' and 'randomforest' are supported.

        Raises ValueError for any other name, so that an unsupported choice
        fails here instead of later in run() with a missing classifier.
        '''
        name = str(algorithm).lower()
        # Validate before anything else is built.
        if name == 'decisiontree':
            clf = DecisionTreeClassifier()
        elif name == 'randomforest':
            clf = RandomForestClassifier()
        else:
            raise ValueError(
                "Unsupported algorithm {!r}; expected 'decisiontree' or "
                "'randomforest'.".format(algorithm))

        self.data_preprocessor = DataPreprocessor()
        self.clf = clf

    def run(self):
        '''
        create and store mechanism

        Loads and preprocesses the data, fits the classifier on the
        'currentclusternumber' column with 'nextcluster' as label, prints
        the label distribution and the accuracy on the held-out split, and
        saves the fitted classifier as a Mechansim.
        '''
        df = self.data_preprocessor.get_data()
        df = self.data_preprocessor.preprocess_data(df)

        # nextcluster as label for ML algorithm
        y = df['nextcluster'].values
        y = y.astype('int')

        X_train, X_test, y_train, y_test = train_test_split(
            df[['currentclusternumber']].values, y)

        # TODO: imbalanced classes?
        print(df.nextcluster.value_counts())

        # Classifer
        self.clf = self.clf.fit(X_train, y_train)
        y_pred = self.clf.predict(X_test)
        print("Accuracy of Classifier:",
              metrics.accuracy_score(y_test, y_pred))

        # save the model to db
        # NOTE(review): data_preprocessor.le is presumably the label encoder
        # fitted during preprocessing - confirm against DataPreprocessor.
        mechanism = Mechansim(self.clf, self.data_preprocessor.le,
                              int(round(time.time() * 1000)))
        mechanism.save()