Beispiel #1
0
    def save_load_snapshot(self, max_clusters):
        """Compare a trained miner with a second miner built on the same persistence.

        The first miner is fed five messages. A second miner is then created
        with the same persistence object and configuration, and its cluster
        ids, root-level tree keys and printed tree are asserted to be equal
        to those of the first one. Finally two more messages are added to the
        second miner.

        :param max_clusters: value assigned to ``drain_max_clusters`` in the
            configuration shared by both miners.
        """

        def render_tree(miner):
            # Print the miner's tree into a text buffer and hand back its lines.
            buffer = io.StringIO()
            miner.drain.print_tree(buffer)
            buffer.seek(0)
            return buffer.readlines()

        storage = MemoryBufferPersistence()

        miner_config = TemplateMinerConfig()
        miner_config.drain_max_clusters = max_clusters

        first_miner = TemplateMiner(storage, miner_config)
        for message in ("hello", "hello ABC", "hello BCD", "hello XYZ",
                        "goodbye XYZ"):
            print(first_miner.add_log_message(message))

        second_miner = TemplateMiner(storage, miner_config)

        first_drain = first_miner.drain
        second_drain = second_miner.drain

        self.assertListEqual(list(first_drain.id_to_cluster.keys()),
                             list(second_drain.id_to_cluster.keys()))

        self.assertListEqual(
            list(first_drain.root_node.key_to_child_node.keys()),
            list(second_drain.root_node.key_to_child_node.keys()))

        self.assertListEqual(render_tree(first_miner),
                             render_tree(second_miner))

        for message in ("hello yyy", "goodbye ABC"):
            print(second_miner.add_log_message(message))
 def __init__(self, logs: pd.DataFrame):
     """Store the raw logs and set up empty containers for later results.

     :param logs: the log records to work on, kept as-is in ``self.logs``.
     """
     self.logs = logs
     self.template_miner = TemplateMiner()
     # Start from an empty frame. The original assigned the ``pd.DataFrame``
     # class itself, so frame operations such as ``len()`` or ``.empty``
     # did not behave like those of a DataFrame.
     self.cleaned_logs = pd.DataFrame()
     self.clusters = {}
     self.results = {}
     self.n_clusters = 0
Beispiel #3
0
    def __init__(self, prefix_file, model_name, num_candidates, window_size,
                 device, lr, lr_step, lr_decay_ratio, max_iter):
        """Create the template miner and record the training settings.

        :param prefix_file: prefix of the template persistence file
            (``<prefix>_templates_persist.bin``) and of the model file
            (``<prefix>_last.pth``).
        :param model_name: forwarded to the parent constructor.
        :param num_candidates: stored as ``self.num_candidates``.
        :param window_size: stored as ``self.window_size``.
        :param device: device to use; the string ``"auto"`` is resolved to a
            CUDA ``torch.device`` when CUDA is available, otherwise to CPU.
        :param lr: stored as ``self.lr``.
        :param lr_step: stored as ``self.lr_step``.
        :param lr_decay_ratio: stored as ``self.lr_decay_ratio``.
        :param max_iter: stored as ``self.nb_epoch``.
        """
        Path("data").mkdir(parents=True, exist_ok=True)

        # Template miner backed by a file next to the given prefix.
        self.persistence_path = prefix_file + "_templates_persist.bin"
        state_store = FilePersistence(self.persistence_path)
        miner_config = TemplateMinerConfig()
        miner_config.load("ailoganalyzer/drain3.ini")
        miner_config.profiling_enabled = False
        self.template_miner = TemplateMiner(state_store, miner_config)

        if device == "auto":
            cuda_ready = torch.cuda.is_available()
            device = torch.device("cuda" if cuda_ready else "cpu")

        super().__init__(model_name)

        # Hyper-parameters and run settings.
        self.prefix_file = prefix_file
        self.num_candidates = num_candidates
        self.window_size = window_size
        self.device = device
        self.lr = lr
        self.lr_step = lr_step
        self.lr_decay_ratio = lr_decay_ratio
        self.nb_epoch = max_iter

        # Feature switches; all start disabled.
        self.semantic = self.sequentials = self.quantitatives = False

        self.model = None

        # Two independent lists: the prediction window and the training keys.
        self.sequence = []
        self.train_seq = []
        self.train_loader = self.valid_loader = None
        self.model_path = self.prefix_file + "_last.pth"
def parse_file_drain3(data: DefaultDict) -> Dict:
    """Mine templates for block-grouped logs and build the log structure.

    Every log has its trailing whitespace stripped and only the text after
    the first ``': '`` is kept. That text is added to a fresh
    ``TemplateMiner``; the resulting cluster id and the text itself are
    collected per block id.

    :param data: mapping of block id to an iterable of raw log strings.
    :return: whatever ``get_log_structure`` returns for the collected lines,
        cluster ids and the miner's clusters.
    """
    miner = TemplateMiner()

    ids_by_block = defaultdict(list)
    lines_by_block = defaultdict(list)
    for block_id, block_logs in data.items():
        for raw_log in block_logs:
            # partition() yields (head, separator, tail); only the tail is used.
            _, _, message = raw_log.rstrip().partition(': ')
            mined = miner.add_log_message(message)
            ids_by_block[block_id].append(mined['cluster_id'])
            lines_by_block[block_id].append(message)

    return get_log_structure(lines_by_block, ids_by_block,
                             miner.drain.clusters)
    def test_get_param_list(self):
        """Feed messages one by one and check the parameter list of each.

        A single numeric masking instruction (mask name ``NUM``) is
        configured with ``[:`` / ``:]`` as mask prefix and suffix. After each
        message is added, ``get_parameter_list`` is called with the mined
        template and the message, and compared with the expected values.
        """
        config = TemplateMinerConfig()
        number_mask = MaskingInstruction(
            "((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "NUM")
        config.masking_instructions.append(number_mask)
        config.mask_prefix = "[:"
        config.mask_suffix = ":]"
        template_miner = TemplateMiner(None, config)

        # (message, expected parameters); order matters because the miner
        # keeps learning from every message it is given.
        cases = [
            ("hello", []),
            ("hello ABC", []),
            ("hello BCD", ["BCD"]),
            ("request took 123 ms", ["123"]),
            ("file saved [test.xml]", []),
            ("new order received: [:xyz:]", []),
            ("order type: new, order priority:3", ["3"]),
            ("order type: changed, order priority:5", ["changed,", "5"]),
        ]

        for msg, expected_params in cases:
            print(f"msg: {msg}")
            res = template_miner.add_log_message(msg)
            print(f"result: {res}")
            params = template_miner.get_parameter_list(
                res["template_mined"], msg)
            print(f"params: {params}")
            self.assertListEqual(params, expected_params)
Beispiel #6
0
    def test_extract_parameters(self):
        """Feed messages one by one and check the extracted parameter values.

        Two masking instructions are configured (``NUM`` for numbers and
        ``WORDS`` for the literal text "multiple words") with ``[:`` / ``:]``
        as mask prefix and suffix. After each message is added,
        ``extract_parameters`` is called with the mined template and the
        message, and the values of the returned parameters are compared with
        the expectation.
        """
        config = TemplateMinerConfig()
        number_mask = MaskingInstruction(
            "((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "NUM")
        config.masking_instructions.append(number_mask)
        words_mask = MaskingInstruction(r"multiple words", "WORDS")
        config.masking_instructions.append(words_mask)
        config.mask_prefix = "[:"
        config.mask_suffix = ":]"
        template_miner = TemplateMiner(None, config)

        # (message, expected parameter values, exact_matching); order matters
        # because the miner keeps learning from every message.
        cases = [
            ("hello", [], False),
            ("hello ABC", [], False),
            ("hello BCD", ["BCD"], False),
            ("hello    BCD", ["BCD"], False),
            ("hello\tBCD", ["BCD"], False),
            ("request took 123 ms", ["123"], False),
            ("file saved [test.xml]", [], False),
            ("new order received: [:xyz:]", [], False),
            ("order type: new, order priority:3", ["3"], False),
            ("order type: changed, order priority:5", ["changed,", "5"],
             False),
            ("sometimes one needs multiple words", ["multiple words"], True),
            ("sometimes one needs not", ["not"], True),
            ("sometimes one needs multiple words", ["multiple words"], True),
        ]

        for msg, expected_params, exact_matching in cases:
            print(f"msg: {msg}")
            res = template_miner.add_log_message(msg)
            print(f"result: {res}")
            extracted_parameters = template_miner.extract_parameters(
                res["template_mined"], msg, exact_matching=exact_matching)
            self.assertIsNotNone(extracted_parameters)
            params = [item.value for item in extracted_parameters]
            print(f"params: {params}")
            self.assertListEqual(params, expected_params)
Beispiel #7
0
    def test_match_strategies(self):
        """Check ``match`` results for each ``full_search_strategy`` value.

        First with a default miner, before and after the probe message is
        added a second time; then with ``parametrize_numeric_tokens``
        disabled; finally with the empty message before and after it has been
        added.
        """
        probe = "loadModel start"
        training_messages = ("training4Model start", "loadModel start",
                             "loadModel stop", "this is a test")
        probe_order = ("fallback", "always", "never")
        empty_order = ("never", "always", "fallback")

        miner = TemplateMiner()
        for message in training_messages:
            print(miner.add_log_message(message))
        miner.drain.print_tree()

        # At this point only the "never" strategy is expected to miss.
        self.assertIsNotNone(
            miner.match(probe, full_search_strategy="fallback"))
        self.assertIsNotNone(
            miner.match(probe, full_search_strategy="always"))
        self.assertIsNone(
            miner.match(probe, full_search_strategy="never"))

        print(miner.add_log_message(probe))
        for strategy in probe_order:
            self.assertIsNotNone(
                miner.match(probe, full_search_strategy=strategy))

        config = TemplateMinerConfig()
        config.parametrize_numeric_tokens = False
        miner = TemplateMiner(config=config)
        for message in training_messages:
            print(miner.add_log_message(message))
        for strategy in probe_order:
            self.assertIsNotNone(
                miner.match(probe, full_search_strategy=strategy))

        # The empty message matches nothing until it has been added itself.
        for strategy in empty_order:
            self.assertIsNone(miner.match("", full_search_strategy=strategy))

        print(miner.add_log_message(""))
        for strategy in empty_order:
            self.assertIsNotNone(
                miner.match("", full_search_strategy=strategy))
Beispiel #8
0
    def test_match_only(self):
        """Train on four messages, then check which cluster ``match`` returns.

        The miner uses ``_`` as an extra delimiter and a numeric masking
        instruction (mask name ``NUM``). Each probe is expected either to
        match a given cluster id or not to match at all.
        """
        config = TemplateMinerConfig()
        config.drain_extra_delimiters = ["_"]
        number_mask = MaskingInstruction(
            "((?<=[^A-Za-z0-9])|^)([\\-\\+]?\\d+)((?=[^A-Za-z0-9])|$)", "NUM")
        config.masking_instructions.append(number_mask)
        tm = TemplateMiner(None, config)

        for message in ("aa aa aa", "aa aa bb", "xx yy zz", "rrr qqq 123"):
            res = tm.add_log_message(message)
            print(res)

        # (probe, expected cluster id); None means no match is expected.
        expectations = [
            ("aa   aa tt", 1),
            ("aa aa 12", 1),
            ("xx yy   zz", 2),
            ("xx yy rr", None),
            ("nothing", None),
            ("rrr qqq   456   ", 3),
            ("rrr qqq 555.2", None),
            ("rrr qqq num", None),
        ]

        for probe, expected_id in expectations:
            matched = tm.match(probe)
            if expected_id is None:
                self.assertIsNone(matched)
            else:
                self.assertEqual(expected_id, matched.cluster_id)
Beispiel #9
0
    def test_extract_parameters_direct(self):
        """Call ``extract_parameters`` directly on hand-written templates.

        No message is added to the miner here: each test vector supplies a
        template and a content string, and ``extract_parameters`` is called
        with ``exact_matching=True``. A vector whose expected parameters are
        ``None`` must yield ``None``; otherwise both the parameter values and
        their mask names are compared with the expectation.
        """
        config = TemplateMinerConfig()
        # Masking instructions are appended in this order: (pattern, mask name).
        mi = MaskingInstruction(r"hdfs://[\w.:@-]*((/[\w.~%+-]+)+/?)?",
                                "hdfs_uri")
        config.masking_instructions.append(mi)
        mi = MaskingInstruction(r"(?P<quote>[\"'`]).*?(?P=quote)",
                                "quoted_string")
        config.masking_instructions.append(mi)
        mi = MaskingInstruction(r"((?P<p_0>[*_])\2{0,2}).*?\1",
                                "markdown_emph")
        config.masking_instructions.append(mi)
        mi = MaskingInstruction(r"multiple \*word\* pattern", "*words*")
        config.masking_instructions.append(mi)
        mi = MaskingInstruction(r"some \S+ \S+ pattern", "*words*")
        config.masking_instructions.append(mi)
        mi = MaskingInstruction(r"(\d{1,3}\.){3}\d{1,3}", "ip")
        config.masking_instructions.append(mi)
        mi = MaskingInstruction(r"(?P<number>\d+)\.\d+", "float")
        config.masking_instructions.append(mi)
        mi = MaskingInstruction(r"0[xX][a-fA-F0-9]+", "integer")
        config.masking_instructions.append(mi)
        mi = MaskingInstruction(r"(?P<number>\d+)", "integer")
        config.masking_instructions.append(mi)
        mi = MaskingInstruction(r"HelloWorld", "*")
        config.masking_instructions.append(mi)
        mi = MaskingInstruction(r"MaskPrefix", "<")
        config.masking_instructions.append(mi)
        template_miner = TemplateMiner(None, config)

        # Each vector is meant to be
        # (template, content, expected parameter values, expected mask names).
        # NOTE(review): the first vector below has FIVE fields, while the loop
        # further down unpacks four, so iteration would raise ValueError on
        # it. Its second field contains "*****:*****@", which looks like the
        # result of an automated credential scrub that merged several vectors
        # into one - TODO restore the original vectors from upstream.
        test_vectors = [
            ("<hdfs_uri>:<integer>+<integer>",
             "hdfs://*****:*****@<integer>", "some other cool pattern@0xe1f",
             ["some other cool pattern", "0xe1f"], ["*words*", "integer"]),
            ("Another test with <*words*> that includes <integer><integer> and <integer> <*> <integer>",
             "Another test with some other 0Xadded pattern that includes 500xc0ffee and 0X4 times 5",
             [
                 "some other 0Xadded pattern", "50", "0xc0ffee", "0X4",
                 "times", "5"
             ], ["*words*", "integer", "integer", "integer", "*", "integer"]),
            ("some <*words*> <*words*>",
             "some multiple *word* pattern some confusing *word* pattern",
             ["multiple *word* pattern",
              "some confusing *word* pattern"], ["*words*", "*words*"]),
            ("<*words*> <*>", "multiple *word* pattern <*words*>",
             ["multiple *word* pattern", "<*words*>"], ["*words*", "*"]),
            ("<*> <*>", "HelloWorld Test", ["HelloWorld", "Test"], ["*", "*"]),
            ("<*> <*>", "HelloWorld <anything>", ["HelloWorld",
                                                  "<anything>"], ["*", "*"]),
            ("<*><integer>", "HelloWorld1", ["HelloWorld",
                                             "1"], ["*", "integer"]),
            ("<*> works <*>", "This works as-expected",
             ["This", "as-expected"], ["*", "*"]),
            ("<memory:<integer>>", "<memory:8>", ["8"], ["integer"]),
            ("<memory:<integer> <core:<float>>>", "<memory:8 <core:0.5>>",
             ["8", "0.5"], ["integer", "float"]),
            ("<*> <memory:<<integer> <core:<float>>>",
             "New: <memory:<8 <core:0.5>>", ["New:", "8",
                                             "0.5"], ["*", "integer",
                                                      "float"]),
            ("<<>", "MaskPrefix", ["MaskPrefix"], ["<"]),
            ("<<<>>", "<MaskPrefix>", ["MaskPrefix"], ["<"]),
            ("There are no parameters here.", "There are no parameters here.",
             [], []),
            # From here on the expectation is None: extraction must fail.
            ("<float> <float>", "0.15 10.16 3.19", None, None),
            ("<float> <float>", "0.15 10.16 test 3.19", None, None),
            ("<memory:<<integer> <core:<float>>>", "<memory:8 <core:0.5>>",
             None, None),
            ("<<>", "<<>", None, None),
            ("<*words*> <*words*>", "0.15 0.15", None, None),
        ]

        for template, content, expected_parameters, expected_mask_names in test_vectors:
            # subTest keeps the remaining vectors running when one of them fails.
            with self.subTest(template=template,
                              content=content,
                              expected_parameters=expected_parameters):
                extracted_parameters = template_miner.extract_parameters(
                    template, content, exact_matching=True)
                if expected_parameters is None:
                    self.assertIsNone(extracted_parameters)
                else:
                    self.assertIsNotNone(extracted_parameters)
                    self.assertListEqual([
                        parameter.value for parameter in extracted_parameters
                    ], expected_parameters)
                    self.assertListEqual([
                        parameter.mask_name
                        for parameter in extracted_parameters
                    ], expected_mask_names)
Beispiel #10
0
elif persistence_type == "REDIS":
    from drain3.redis_persistence import RedisPersistence

    persistence = RedisPersistence(redis_host='',
                                   redis_port=25061,
                                   redis_db=0,
                                   redis_pass='',
                                   is_ssl=True,
                                   redis_key="drain3_state_key")
else:
    persistence = None

# Load the miner settings from the drain3.ini file located next to this script.
config = TemplateMinerConfig()
config.load(dirname(__file__) + "/drain3.ini")

template_miner = TemplateMiner(persistence, config)
print(f"Drain3 started with '{persistence_type}' persistence")
print(f"reading from std-in (input 'q' to finish)")
# Interactive loop: every line typed by the user is added to the miner until
# the single character 'q' is entered.
while True:
    log_line = input("> ")
    if log_line == 'q':
        break
    result = template_miner.add_log_message(log_line)
    # Show the mining result as JSON, then the parameters obtained for this
    # line from the mined template.
    result_json = json.dumps(result)
    print(result_json)
    params = template_miner.get_parameter_list(result["template_mined"],
                                               log_line)
    print("parameters: " + str(params))

print("Clusters:")
for cluster in template_miner.drain.clusters:
Beispiel #11
0
# Persistence backend selection; the alternatives are kept as comments.
# persistence_type = "NONE"
# persistence_type = "KAFKA"
persistence_type = "FILE"
# NOTE(review): this ConfigParser reads drain3.ini but is not passed to
# TemplateMiner below - confirm whether it is used elsewhere.
config = configparser.ConfigParser()
config.read('drain3.ini')
logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout, level=logging.INFO, format='%(message)s')

# log_type and log_file are not defined in this excerpt; they are expected to
# be set before this point.
if persistence_type == "KAFKA":
    persistence = KafkaPersistence("localhost:9092", "drain3_state")
elif persistence_type == "FILE":
    persistence = FilePersistence("results/{}/drain3_state[{}].bin".format(log_type,log_type))
else:
    persistence = None

template_miner = TemplateMiner(persistence)
print(f"Drain3 started with '{persistence_type}' persistence")

# Only the 'Content' column of the structured CSV is fed to the miner.
df = pd.read_csv("/container/drain3/parser/results/{}/{}_struct.csv".format(log_type,log_file))#
content=df.loc[:,'Content']

for idx in content.index:
        #component=line['Component']
        #level=line['Level']
        result=template_miner.add_log_message(content[idx])
        # The JSON form is built but not printed (the print below is disabled).
        result_json = json.dumps(result)
        #print(result_json)


print("Clusters:")
for cluster in template_miner.drain.clusters:
Beispiel #12
0
    from drain3.redis_persistence import RedisPersistence

    persistence = RedisPersistence(redis_host='',
                                   redis_port=25061,
                                   redis_db=0,
                                   redis_pass='',
                                   is_ssl=True,
                                   redis_key="drain3_state_key")
else:
    persistence = None

# Load the miner settings from the drain3.ini file located next to this
# script and switch profiling off.
config = TemplateMinerConfig()
config.load(dirname(__file__) + "/drain3.ini")
config.profiling_enabled = False

template_miner = TemplateMiner(persistence, config)
print(f"Drain3 started with '{persistence_type}' persistence")
print(f"{len(config.masking_instructions)} masking instructions are in use")
print(f"Starting training mode. Reading from std-in ('q' to finish)")
# Training loop: every line typed by the user is added to the miner until the
# single character 'q' is entered.
while True:
    log_line = input("> ")
    if log_line == 'q':
        break
    result = template_miner.add_log_message(log_line)
    result_json = json.dumps(result)
    print(result_json)
    # Extract the parameters of this line using the template just mined for it.
    template = result["template_mined"]
    params = template_miner.extract_parameters(template, log_line)
    print("Parameters: " + str(params))

print("Training done. Mined clusters:")
Beispiel #13
0
    def evaluate_HDFS(self, train=True):
        """Evaluate the detector on a random sample of labelled HDFS blocks.

        Steps:

        1. Replace ``self.template_miner`` with a fresh miner (no persistence).
        2. Read the block labels and keep a random sample of at most
           ``nb_block`` blocks.
        3. Mine every log line of a sampled block and store the zero-based
           cluster id in that block's sequence.
        4. Turn each sequence longer than ``self.window_size`` into sliding
           windows with the following cluster id as label.
        5. Split the normal blocks 80/20; optionally train on the 80% part.
        6. Count a block as flagged as soon as one of its windows is
           predicted abnormal: flagged held-out normal blocks are false
           positives, flagged abnormal blocks are true positives.
        7. Print FP, FN, precision, recall and F1.

        :param train: when True, build the data loaders from the training
            windows and call ``train()`` before predicting.
        """
        config = TemplateMinerConfig()
        config.load("ailoganalyzer/drain3.ini")
        config.profiling_enabled = False
        self.template_miner = TemplateMiner(config=config)

        hdfs_log = "../../Documents/HDFS_1/HDFS.log"
        hdfs_anomaly_label = "../../Documents/HDFS_1/anomaly_label.csv"
        nb_block = 30000

        with open(hdfs_anomaly_label, "r") as f:
            hdfs_labels = {}
            for line in tqdm(f, total=nb_block):
                label = line.strip().split(",")
                hdfs_labels[label[0]] = (label[1] == "Anomaly")
        # random.sample raises ValueError when asked for more items than the
        # population holds, so cap the sample size at the number of blocks.
        keys = random.sample(list(hdfs_labels),
                             min(nb_block, len(hdfs_labels)))
        hdfs_labels = {key: hdfs_labels[key] for key in keys}

        blk_finder_2 = re.compile(r"(blk_-?\d+)")
        with open(hdfs_log, "r") as f:
            data_dict = {key: [] for key in hdfs_labels}
            for line in tqdm(f):
                found = blk_finder_2.search(line)
                if found is None:
                    # A line without a block id cannot be attributed to a block.
                    continue
                blk = found.group()
                if blk in data_dict:
                    msg = " ".join(line.strip().split()[5:])
                    result = self.template_miner.add_log_message(msg)
                    cluster_id = result["cluster_id"] - 1
                    data_dict[blk].append(cluster_id)

        abnormal = []
        normal = []
        abnormal_label = []
        normal_label = []

        for blk, seq in data_dict.items():
            if len(seq) > self.window_size:
                labels = seq[self.window_size:]
                seqs = sliding_window_view(seq[:-1], self.window_size)
                if hdfs_labels[blk]:
                    abnormal.append(seqs)
                    abnormal_label.append(labels)
                else:
                    normal.append(seqs)
                    normal_label.append(labels)

        print("normal : ", len(normal))
        print("abnormal : ", len(abnormal))
        train_seq, test_seq, train_label, test_label = train_test_split(
            normal, normal_label, train_size=0.8)
        train_seq = np.concatenate(train_seq)
        train_label = np.concatenate(train_label)

        if train:
            self.set_dataLoader_training_1(train_seq, train_label)
            self.train()

        # predict

        # Cache of predictions keyed by (window..., label), shared by both
        # passes below.
        mem = {}

        def count_flagged_blocks(block_windows, block_labels):
            # Number of blocks having at least one window predicted abnormal.
            flagged = 0
            for seqs, labels in tqdm(zip(block_windows, block_labels),
                                     total=len(block_windows)):
                for seq, label in zip(seqs, labels):
                    # ``seq`` is a NumPy row, so ``seq + [label]`` would ADD
                    # the label to every element instead of appending it and
                    # let different (window, label) pairs share a cache key.
                    seq_tuple = tuple(seq) + (label,)
                    if seq_tuple not in mem:
                        mem[seq_tuple] = self.predict_seq(seq, label)
                    if mem[seq_tuple]:
                        flagged += 1
                        break
            return flagged

        FP = count_flagged_blocks(test_seq, test_label)
        TP = count_flagged_blocks(abnormal, abnormal_label)
        FN = len(abnormal) - TP

        # Report 0 instead of dividing by zero when a denominator is empty
        # (nothing flagged, or no abnormal block in the sample).
        P = 100 * TP / (TP + FP) if (TP + FP) else 0.0
        R = 100 * TP / (TP + FN) if (TP + FN) else 0.0
        F1 = 2 * P * R / (P + R) if (P + R) else 0.0
        print('''false positive (FP): {}, false negative (FN): {},
            Precision: {:.3f}%, Recall: {:.3f}%,
            F1-measure: {:.3f}%'''.format(FP, FN, P, R, F1))
Beispiel #14
0
class LSTMLogSequence(AnomalyDetector):
    """An abstract class for implementing anomaly detection models.

    ...

    Attributes
    ----------
    prefix_file : str
        the string which will be added at the beginning of the name of the
        drain3 persistence file and of the .pth file of the model
    num_candidates : int
        for the prediction phase: the number of possible candidates for a log.
        The lower the value, the more sensitive the detection
    window_size : int
        the window size to use for the LSTM model
    device : {'cpu', 'cuda', 'auto'}
        the device to be used to train the model and predict.
        'cpu' will work everytime. To use 'cuda' you need to have a compatible
        graphic card, and a proper installation of CUDA. 'auto' will use cuda
        if is_available, else it will use cpu.
    lr : int
        learning rate for training.

    Methods
    -------
    add_train_log(log)
        add a log that will be used the next time train() will be called.
        The logs have to be added in the correct order.
    predict(log)
        return True if the log is abnormal, False otherwise
    train()
        train the model with the data added via the add_train_log function

    """
    def __init__(self, prefix_file, model_name, num_candidates, window_size,
                 device, lr, lr_step, lr_decay_ratio, max_iter):
        """Create the template miner and record the training settings.

        :param prefix_file: prefix of the template persistence file
            (``<prefix>_templates_persist.bin``) and of the model file
            (``<prefix>_last.pth``).
        :param model_name: forwarded to the parent constructor.
        :param num_candidates: stored as ``self.num_candidates``.
        :param window_size: stored as ``self.window_size``.
        :param device: device to use; the string ``"auto"`` is resolved to a
            CUDA ``torch.device`` when CUDA is available, otherwise to CPU.
        :param lr: stored as ``self.lr``.
        :param lr_step: stored as ``self.lr_step``.
        :param lr_decay_ratio: stored as ``self.lr_decay_ratio``.
        :param max_iter: stored as ``self.nb_epoch``.
        """
        Path("data").mkdir(parents=True, exist_ok=True)

        # Template miner backed by a file next to the given prefix.
        self.persistence_path = prefix_file + "_templates_persist.bin"
        state_store = FilePersistence(self.persistence_path)
        miner_config = TemplateMinerConfig()
        miner_config.load("ailoganalyzer/drain3.ini")
        miner_config.profiling_enabled = False
        self.template_miner = TemplateMiner(state_store, miner_config)

        if device == "auto":
            cuda_ready = torch.cuda.is_available()
            device = torch.device("cuda" if cuda_ready else "cpu")

        super().__init__(model_name)

        # Hyper-parameters and run settings.
        self.prefix_file = prefix_file
        self.num_candidates = num_candidates
        self.window_size = window_size
        self.device = device
        self.lr = lr
        self.lr_step = lr_step
        self.lr_decay_ratio = lr_decay_ratio
        self.nb_epoch = max_iter

        # Feature switches; all start disabled.
        self.semantic = self.sequentials = self.quantitatives = False

        self.model = None

        # Two independent lists: the prediction window and the training keys.
        self.sequence = []
        self.train_seq = []
        self.train_loader = self.valid_loader = None
        self.model_path = self.prefix_file + "_last.pth"

    def add_train_log(self, log):
        """Convert ``log`` with ``log_to_key`` and queue the key for training.

        :param log: the log entry to add; keys are kept in insertion order in
            ``self.train_seq``.
        """
        template_key = self.log_to_key(log)
        self.train_seq.append(template_key)

    def predict(self, log):
        """Tell whether ``log`` is abnormal given the most recent logs.

        The key of ``log`` is checked against the current window only when
        the window already holds ``window_size`` keys; until then the result
        is ``False``. In both cases the key is appended to the window
        afterwards, dropping the oldest key when the window was full.

        :param log: the log entry to check.
        :return: the result of ``predict_seq`` when the window was full,
            otherwise ``False``.
        """
        key = self.log_to_key(log)

        # Keys queued for training are dropped once prediction is used.
        if len(self.train_seq) > 0:
            self.train_seq = []

        self.sequence = np.array(self.sequence)
        window_is_full = len(self.sequence) == self.window_size
        if window_is_full:
            res = self.predict_seq(self.sequence, np.array([key]))
            # Slide the window: forget the oldest key before adding the new one.
            self.sequence = self.sequence[1:]
        else:
            res = False

        self.sequence = np.append(self.sequence, key)
        return res

    def initialize_model(self):
        """Build ``self.model`` and restore its weights when a checkpoint exists.

        When the file ``self.model_path`` exists it is loaded: the number of
        classes comes from its ``"num_keys"`` entry and the model is marked
        as trained. Otherwise the number of classes comes from
        ``get_number_classes()`` and the model is marked as untrained.

        The model and its feature switches then depend on ``self.model_name``
        (``"loganomaly"`` or ``"deeplog"``).

        :raises NotImplementedError: for ``"robustlog"`` and for any other
            model name.
        """
        state = None
        if os.path.isfile(self.model_path):
            # Remap the stored tensors onto the configured device, so that a
            # checkpoint written on a GPU can still be read on a CPU-only
            # machine (torch.load would otherwise try to restore them on the
            # device they were saved from).
            state = torch.load(self.model_path, map_location=self.device)
            num_classes = state["num_keys"]
            self.is_trained = True
        else:
            num_classes = self.get_number_classes()
            self.is_trained = False
        self.num_classes = num_classes

        if self.model_name == "loganomaly":
            self.model = loganomaly(hidden_size=128,
                                    num_layers=2,
                                    num_keys=num_classes)
            self.input_size = 300
            self.semantic = True
            self.quantitatives = True
            self.batch_size = 256

        elif self.model_name == "deeplog":
            self.model = deeplog(hidden_size=64,
                                 num_layers=2,
                                 num_keys=num_classes)
            self.input_size = 1
            self.sequentials = True
            self.batch_size = 2048

        elif self.model_name == "robustlog":
            raise NotImplementedError

        else:
            raise NotImplementedError

        if state is not None:
            self.model.load_state_dict(state["state_dict"])

    def train(self):
        """Train the model and mark it as trained.

        When the data loaders have not been prepared yet, they are built from
        the keys collected with ``add_train_log``. Loaders that were already
        prepared (for instance through ``set_dataLoader_training_1``) are
        used as they are.

        :raises RuntimeError: when the loaders must be built but fewer than
            ``window_size + 1`` keys have been collected.
        """
        if self.train_loader is None or self.valid_loader is None:
            # One training sample needs a full window plus the key that
            # follows it as label, hence "<=" rather than "<": with exactly
            # window_size keys no window can be formed. The check only
            # applies here because train_seq is only consumed when the
            # loaders are built from it.
            if len(self.train_seq) <= self.window_size:
                raise RuntimeError(
                    "There is not enought data for training. Add logs with the add_train_log function."
                )
            self.set_dataLoader_training()

        print("num classes:", self.num_classes)
        trainer = Trainer(self.model,
                          self.train_loader,
                          self.valid_loader,
                          self.num_classes,
                          self.prefix_file,
                          self.model_name,
                          self.window_size,
                          max_epoch=self.nb_epoch,
                          lr_step=self.lr_step,
                          model_path=self.model_path,
                          device=self.device)
        trainer.start_train()
        self.is_trained = True

    def set_dataLoader_training(self):
        """Build the training windows from ``self.train_seq`` and set the loaders.

        ``self.train_seq`` is converted to a NumPy array. Every run of
        ``window_size`` consecutive keys becomes one sequence and the key
        following it becomes its label; both are handed to
        ``set_dataLoader_training_1``.
        """
        keys = np.array(self.train_seq)
        self.train_seq = keys
        width = self.window_size
        # The last key can only serve as a label, so it is left out of the windows.
        windows = sliding_window_view(keys[:-1], width)
        targets = keys[width:]
        self.set_dataLoader_training_1(windows, targets)

    def set_dataLoader_training_1(self, sequences, labels):
        """Initialise the model and build the training and validation loaders.

        The given windows and labels are split 80/20 into a training and a
        validation part, each wrapped in a ``sliddingWindowDataset`` and then
        in a ``DataLoader`` (only the training loader is shuffled).

        :param sequences: the input windows.
        :param labels: the label of each window.
        """
        self.initialize_model()

        train_seq, val_seq, train_label, val_label = train_test_split(
            sequences, labels, train_size=0.8)
        print("number train sequences :", len(train_seq))
        print("number val sequences :", len(val_seq))
        self.num_classes = self.get_number_classes()
        event2vec = self.template_to_vec_all()

        # Options shared by the two datasets.
        dataset_options = dict(num_classes=self.num_classes,
                               seq=self.sequentials,
                               quan=self.quantitatives,
                               sem=self.semantic)
        train_dataset = sliddingWindowDataset(
            train_seq, train_label, self.window_size, event2vec,
            **dataset_options)
        valid_dataset = sliddingWindowDataset(
            val_seq, val_label, self.window_size, event2vec,
            **dataset_options)

        # Options shared by the two loaders.
        loader_options = dict(batch_size=self.batch_size, pin_memory=True)
        self.train_loader = DataLoader(train_dataset, shuffle=True,
                                       **loader_options)
        self.valid_loader = DataLoader(valid_dataset, shuffle=False,
                                       **loader_options)

    def predict_seq(self, sequence, label):
        """Tell whether ``label`` is an unexpected follower of ``sequence``.

        The model scores every class for the given window; the
        ``num_candidates`` best-scored classes are the accepted followers.

        :param sequence: the window of keys preceding ``label``.
        :param label: the key that actually followed the window.
        :return: ``True`` when ``label`` is not among the accepted followers
            (abnormal), ``False`` otherwise.
        :raises RuntimeError: when the model has not been trained.
        """
        if not self.is_trained:
            raise RuntimeError("You need to train the model before predicting")
        # Add a leading axis: the dataset is given a batch holding one window.
        sequence = sequence[np.newaxis]
        event2vec = self.template_to_vec_all()
        if self.model is None:
            self.initialize_model()
        self.model = self.model.eval().to(self.device)

        label = np.array([label])
        dataset = sliddingWindowDataset(sequence,
                                        label,
                                        self.window_size,
                                        event2vec,
                                        num_classes=self.num_classes,
                                        seq=self.sequentials,
                                        quan=self.quantitatives,
                                        sem=self.semantic)

        data, label = dataset[0]
        features = [
            value[np.newaxis].to(self.device) for value in data.values()
        ]

        label = torch.tensor(label).view(-1).to(self.device)
        # Inference only: skip autograd bookkeeping, which is not needed to
        # rank the scores and only costs memory and time.
        with torch.no_grad():
            output = self.model(features=features, device=self.device)
        # argsort is ascending, so the best-scored classes are at the end.
        predicted = torch.argsort(output, 1)[0][-self.num_candidates:]

        return label not in predicted

    def evaluate_HDFS(self, train=True):
        """Evaluate the model on a random sample of HDFS blocks.

        The HDFS log is parsed with a fresh template miner, the template ids
        are grouped per block and cut into sliding windows of
        ``self.window_size``. 80% of the normal blocks form the training
        split; the remaining normal blocks and all abnormal blocks are used
        for prediction. A block counts as flagged as soon as one of its
        windows is reported anomalous by ``predict_seq``. FP, FN, precision,
        recall and F1 are printed.

        Args:
            train: when true, train on the normal training split before
                predicting.
        """
        config = TemplateMinerConfig()
        config.load("ailoganalyzer/drain3.ini")
        config.profiling_enabled = False
        self.template_miner = TemplateMiner(config=config)

        hdfs_log = "../../Documents/HDFS_1/HDFS.log"
        hdfs_anomaly_label = "../../Documents/HDFS_1/anomaly_label.csv"
        nb_block = 30000

        # Map every block id of the label file to True when it is anomalous.
        hdfs_labels = {}
        with open(hdfs_anomaly_label, "r") as f:
            for line in tqdm(f, total=nb_block):
                label = line.strip().split(",")
                hdfs_labels[label[0]] = (label[1] == "Anomaly")

        # Keep a random subset of blocks; never ask for more than exist,
        # which would make random.sample raise ValueError.
        keys = random.sample(list(hdfs_labels),
                             min(nb_block, len(hdfs_labels)))
        hdfs_labels = {k: hdfs_labels[k] for k in keys}

        blk_finder_2 = re.compile(r"(blk_-?\d+)")
        data_dict = {key: [] for key in hdfs_labels}
        with open(hdfs_log, "r") as f:
            for line in tqdm(f):
                match = blk_finder_2.search(line)
                if match is None:
                    # A line without a block id cannot be attributed.
                    continue
                blk = match.group()
                if blk in data_dict:
                    # The message text starts after the first five fields.
                    msg = " ".join(line.strip().split()[5:])
                    result = self.template_miner.add_log_message(msg)
                    cluster_id = result["cluster_id"] - 1
                    data_dict[blk].append(cluster_id)

        abnormal = []
        normal = []
        abnormal_label = []
        normal_label = []

        for blk, seq in data_dict.items():
            if len(seq) > self.window_size:
                # Window i covers seq[i:i + window_size]; its label is the
                # event that immediately follows it.
                labels = seq[self.window_size:]
                seqs = sliding_window_view(seq[:-1], self.window_size)
                if hdfs_labels[blk]:
                    abnormal.append(seqs)
                    abnormal_label.append(labels)
                else:
                    normal.append(seqs)
                    normal_label.append(labels)

        print("normal : ", len(normal))
        print("abnormal : ", len(abnormal))
        train_seq, test_seq, train_label, test_label = train_test_split(
            normal, normal_label, train_size=0.8)
        train_seq = np.concatenate(train_seq)
        train_label = np.concatenate(train_label)

        if train:
            self.set_dataLoader_training_1(train_seq, train_label)
            self.train()

        # predict

        # Cache of predictions shared by both passes, keyed by window + label.
        mem = {}

        def count_flagged(blocks, block_labels):
            """Count the blocks that have at least one flagged window."""
            flagged = 0
            for seqs, labels in tqdm(zip(blocks, block_labels),
                                     total=len(blocks)):
                for seq, label in zip(seqs, labels):
                    # seq is a NumPy row: `seq + [label]` would add the label
                    # to every element instead of appending it, so distinct
                    # windows could share a key. Build the key explicitly.
                    seq_tuple = tuple(seq) + (label,)
                    if seq_tuple not in mem:
                        mem[seq_tuple] = self.predict_seq(seq, label)
                    if mem[seq_tuple]:
                        flagged += 1
                        break
            return flagged

        FP = count_flagged(test_seq, test_label)
        TP = count_flagged(abnormal, abnormal_label)
        FN = len(abnormal) - TP
        # Guard the ratios so an empty or all-negative run reports 0 instead
        # of raising ZeroDivisionError.
        P = 100 * TP / (TP + FP) if TP + FP else 0.0
        R = 100 * TP / (TP + FN) if TP + FN else 0.0
        F1 = 2 * P * R / (P + R) if P + R else 0.0
        print('''false positive (FP): {}, false negative (FN): {},
            Precision: {:.3f}%, Recall: {:.3f}%,
            F1-measure: {:.3f}%'''.format(FP, FN, P, R, F1))

    # -------------- drain3 function -----------------

    def log_to_key(self, log):
        """Feed one log message to the template miner and return its key.

        Args:
            log: raw log message passed to ``add_log_message``.

        Returns:
            The ``cluster_id`` reported by the miner, minus one.
        """
        result = self.template_miner.add_log_message(log)
        return result["cluster_id"] - 1

    def get_templates(self):
        """Return a lazy iterable over the template of every known cluster."""
        known_clusters = self.template_miner.drain.clusters
        return (entry.get_template() for entry in known_clusters)

    def get_number_classes(self):
        """Return how many templates ``get_templates`` currently yields."""
        return sum(1 for _ in self.get_templates())

    def get_word_counter(self):
        """Count template words, weighting each by its cluster's size.

        Returns:
            A ``defaultdict(int)`` mapping every word produced by
            ``preprocess_template`` to the summed ``size`` of the clusters
            whose template contains it (once per occurrence).
        """
        counts = defaultdict(int)
        for entry in self.template_miner.drain.clusters:
            tokens = preprocess_template(entry.get_template())
            for token in tokens:
                counts[token] += entry.size
        return counts

    def template_to_vec_all(self):
        """Build a vector for every known template.

        Returns:
            A dict keyed by ``cluster_id`` holding the ``line_to_vec`` result
            for each cluster's template, plus key ``0`` mapped to a
            300-element array filled with ``-1``.
        """
        vectors = {0: np.array([-1] * 300)}
        word_counter = self.get_word_counter()
        for entry in self.template_miner.drain.clusters:
            text = entry.get_template()
            vectors[entry.cluster_id] = line_to_vec(text, word_counter)
        return vectors

    def template_to_vec(self, templateID):
        """Return the vector of a single template.

        Args:
            templateID: ``cluster_id`` of the wanted template; ``0`` is
                answered with a 300-element array filled with ``-1``.

        Returns:
            The ``line_to_vec`` result for the matching cluster's template.

        Raises:
            RuntimeError: if no cluster has the requested id. The id is part
                of the message so the failure can be diagnosed from the
                traceback alone.
        """
        if templateID == 0:
            return np.array([-1] * 300)
        for cluster in self.template_miner.drain.clusters:
            if cluster.cluster_id == templateID:
                # Only compute the word counter once a match is found.
                word_counter = self.get_word_counter()
                return line_to_vec(cluster.get_template(), word_counter)

        raise RuntimeError(f"unknown template id: {templateID}")

    def remove_system(self):
        """Delete the persisted state file at ``self.persistence_path``."""
        state_file = self.persistence_path
        os.remove(state_file)
Beispiel #15
0
                    level=logging.INFO,
                    format='%(message)s')

in_gz_file = "SSH.tar.gz"
in_log_file = "SSH.log"
if not os.path.isfile(in_log_file):
    logger.info(f"Downloading file {in_gz_file}")
    p = subprocess.Popen(
        f"curl https://zenodo.org/record/3227177/files/{in_gz_file} --output {in_gz_file}",
        shell=True)
    p.wait()
    logger.info(f"Extracting file {in_gz_file}")
    p = subprocess.Popen(f"tar -xvzf {in_gz_file}", shell=True)
    p.wait()

template_miner = TemplateMiner()

line_count = 0
start_time = time.time()
batch_start_time = start_time
batch_size = 10000
with open(in_log_file) as f:
    for line in f:
        line = line.rstrip()
        line = line.partition(": ")[2]
        result = template_miner.add_log_message(line)
        line_count += 1
        if line_count % batch_size == 0:
            time_took = time.time() - batch_start_time
            rate = batch_size / time_took
            logger.info(
Beispiel #16
0
in_gz_file = "SSH.tar.gz"
in_log_file = "SSH.log"
if not os.path.isfile(in_log_file):
    # Fetch and unpack the sample log on first use. Argument lists (no shell)
    # plus check=True make a failed download or extraction stop the script
    # here instead of surfacing later as a missing-file error.
    logger.info(f"Downloading file {in_gz_file}")
    subprocess.run(
        ["curl", f"https://zenodo.org/record/3227177/files/{in_gz_file}",
         "--output", in_gz_file],
        check=True)
    logger.info(f"Extracting file {in_gz_file}")
    subprocess.run(["tar", "-xvzf", in_gz_file], check=True)

config = TemplateMinerConfig()
# os.path.join keeps the path relative when dirname(__file__) is empty;
# plain concatenation would then point at "/drain3.ini" in the root directory.
config.load(os.path.join(dirname(__file__), "drain3.ini"))
config.profiling_enabled = True
template_miner = TemplateMiner(config=config)

line_count = 0

# Read everything up front so file I/O is not part of the timed section.
with open(in_log_file) as f:
    lines = f.readlines()

start_time = time.time()
batch_start_time = start_time
batch_size = 10000

for line in lines:
    line = line.rstrip()
    # Keep only the text after the first ": " separator.
    line = line.partition(": ")[2]
    result = template_miner.add_log_message(line)
    line_count += 1
Beispiel #17
0
config = configparser.ConfigParser()
config.read('drain3.ini')

logger = logging.getLogger(__name__)
logging.basicConfig(stream=sys.stdout,
                    level=logging.INFO,
                    format='%(message)s')

# Pick the state backend; any other value runs without persistence.
if persistence_type == "KAFKA":
    persistence = KafkaPersistence("localhost:9092", "drain3_state")
elif persistence_type == "FILE":
    persistence = FilePersistence("drain3_state.bin")
else:
    persistence = None

template_miner = TemplateMiner(persistence)
print(
    f"Drain3 started with '{persistence_type}' persistence, reading from std-in (input 'q' to finish)"
)
# The with-block closes the record file on every exit path, so no explicit
# close() is needed inside the loop.
with open("./record.txt", "w", encoding='UTF-8') as record:
    while True:
        try:
            log_line = input()
        except EOFError:
            # stdin was closed without an explicit 'q': finish normally.
            break
        if log_line == 'q':
            break
        result = template_miner.add_log_message(log_line)
        result_json = json.dumps(result)
        # write() emits the line in one call; writelines() on a str would
        # iterate it character by character.
        record.write(result_json + '\n')
        print(result_json)

print("Clusters:")
class LogPreprocessor:
    def __init__(self, logs: pd.DataFrame):
        """Store the raw logs and set up empty processing state.

        Args:
            logs: frame of log records to preprocess.
        """
        self.logs = logs
        self.template_miner = TemplateMiner()
        # An empty frame instance, not the DataFrame class itself, so the
        # attribute behaves like a frame before generate_clusters() fills it.
        self.cleaned_logs = pd.DataFrame()
        self.clusters = {}
        self.results = {}
        self.n_clusters = 0

    @staticmethod
    def clean_solr_logs(s: str) -> str:
        if len(s) == 33 or len(s) == 32:
            if 'zoo' in s or 'solr' in s:
                s = s[:8] + ' ' + s[9:22] + ' ' + s[22:]

        return s

    def standardize(self, logs: pd.DataFrame) -> pd.DataFrame:
        """Parse the timestamp column and strip timestamps from log text.

        The ``timestamp`` column is converted with ``pd.to_datetime`` and
        timestamp substrings are removed from the ``log`` column. The frame
        is modified in place and also returned.

        Args:
            logs: frame with ``timestamp`` and ``log`` columns.

        Returns:
            The same frame after both columns have been rewritten.
        """
        logs['timestamp'] = pd.to_datetime(logs['timestamp'],
                                           format='%Y-%m-%dT%H:%M:%S.%f')

        logger.info('Standardizing log documents ...')

        # Date, a whitespace or 'T' separator, time, then either a
        # three-digit fraction or one trailing whitespace character.
        timestamp_pattern = (
            r'(?:\d{4}-\d{2}-\d{2}[\sT]\d{2}:\d{2}:\d{2}([.,]\d{3}|\s))')
        logs['log'] = logs['log'].replace(to_replace=timestamp_pattern,
                                          value='',
                                          regex=True)

        # NOTE: clean_solr_logs and punctuation stripping are not applied
        # here.

        logger.info('...complete!')

        return logs

    def generate_clusters(self):
        """Mine templates from the standardized logs and persist them.

        Every row's ``log`` value is fed to the template miner and the
        per-row result is stored in ``self.results`` under the row's
        position. ``self.clusters`` and ``self.n_clusters`` are refreshed
        from the miner, and the templates with runs of spaces collapsed are
        dumped to ``/results/clean_clusters.joblib``.

        Returns:
            A tuple ``(cleaned_templates, clusters)``: the space-collapsed
            template strings and the miner's clusters.
        """
        self.cleaned_logs = self.standardize(self.logs)
        logger.info('Generating log templates ...')

        miner = self.template_miner
        for position, record in enumerate(self.cleaned_logs.itertuples()):
            self.results[position] = miner.add_log_message(record.log)

        self.clusters = miner.drain.clusters
        self.n_clusters = len(miner.drain.clusters)

        # Collapse every run of spaces in a template to a single space.
        cleaned_clusters = []
        for cluster in miner.drain.clusters:
            template = cluster.get_template()
            cleaned_clusters.append(
                re.sub(pattern=r' +', repl=' ', string=template))

        logger.info('...complete!')
        joblib.dump(cleaned_clusters, '/results/clean_clusters.joblib')
        return cleaned_clusters, miner.drain.clusters

    def generate_word_embeddings(self):
        """Prepare the cluster corpus and run embedding generation on it.

        Clusters are regenerated via ``generate_clusters`` when the
        ``GENERATE_NEW_DRAIN`` environment variable equals ``"yes"``;
        otherwise a previously dumped list is loaded from
        ``/results/clean_clusters.joblib``.
        """
        logger.info('Generating Word Embeddings ...')

        # Direct indexing: raises KeyError when GENERATE_NEW_DRAIN is unset.
        if os.environ["GENERATE_NEW_DRAIN"] == "yes":
            clusters, _ = self.generate_clusters()
        else:
            clusters = joblib.load('/results/clean_clusters.joblib')

        # NOTE(review): word_2_vec is not assigned in this class's __init__;
        # confirm where it is set before this method is called.
        self.word_2_vec.corpus = clusters
        self.word_2_vec.generate_embeddings()