def crf_trainfile(conf, iterobj):
    """Return the text of a CRF training file built from sampled messages.

    A CRF template generator is initialized from ``conf``, training
    messages are selected with ``make_crf_train``, and each message is
    converted with ``trainitems`` and serialized with ``_items.items2str``.

    Args:
        conf: Configuration forwarded to the generator and the sampler.
        iterobj: Message source forwarded to ``make_crf_train``.

    Returns:
        str: The serialized messages joined with a double newline.
    """
    from . import lt_crf

    crf_gen = lt_crf.init_ltgen_crf(conf, lt_common.TemplateTable())

    serialized = []
    for message in make_crf_train(conf, iterobj):
        serialized.append(_items.items2str(crf_gen.trainitems(message)))
    return "\n\n".join(serialized)
def make_crf_train(conf, iterobj, return_ltidlist=False):
    """Select the messages used to train the CRF model.

    The strategy is read from ``conf["log_template_crf"]["sample_method"]``:

    * ``"all"``    -- every message of ``iterobj``.
    * ``"random"`` -- result of ``train_sample_random(iterobj, size)``.
    * ``"ltgen"``  -- result of ``train_sample_ltgen`` using a template
      generator built from the ``sample_lt_methods`` option.
    * ``"leak"``   -- result of ``train_sample_leak(iterobj, size)``.

    ``size`` is the ``n_sample`` option of the same section; it is read
    for every strategy, including ``"all"``.

    Args:
        conf: Configuration object.
        iterobj: Iterable of messages to sample from.
        return_ltidlist: When true, also return ``lm.lt.ltid`` of every
            selected message.

    Returns:
        The selected messages, or ``(messages, ltid_list)`` when
        ``return_ltidlist`` is true.

    Raises:
        NotImplementedError: If the configured method name is unknown.
    """
    section = "log_template_crf"
    method = conf[section]["sample_method"]
    size = conf.getint(section, "n_sample")

    # Reject unknown names first so the branches below only handle valid ones.
    if method not in ("all", "random", "ltgen", "leak"):
        raise NotImplementedError(
            "Invalid sampling method name {0}".format(method))

    if method == "all":
        sampled = list(iterobj)
    elif method == "random":
        sampled = train_sample_random(iterobj, size)
    elif method == "leak":
        sampled = train_sample_leak(iterobj, size)
    else:
        # method == "ltgen": sample with the help of other template generators.
        lt_methods = config.getlist(conf, section, "sample_lt_methods")
        use_mp = conf.getboolean(section, "sample_lt_multiprocess")
        table = lt_common.TemplateTable()
        ltgen = amulog.manager.init_ltgen_methods(conf, table, lt_methods,
                                                  multiprocess=use_mp)
        sampled = train_sample_ltgen(iterobj, size, ltgen)

    if not return_ltidlist:
        return sampled
    return sampled, [lm.lt.ltid for lm in sampled]
def make_crf_model_from_trainfile(conf, fp, output=None):
    """Train a CRF model from an existing training file.

    Args:
        conf: Configuration used to initialize the CRF template generator.
        fp: Training file passed to ``train_from_file``.
        output: Optional output argument passed through to
            ``train_from_file``.

    Returns:
        The model path returned by ``train_from_file``.
    """
    from . import lt_crf

    trainer = lt_crf.init_ltgen_crf(conf, lt_common.TemplateTable())
    trainer.init_trainer()

    model_path = trainer.train_from_file(fp, output)
    # Sanity check: training is expected to leave a model file on disk.
    assert os.path.exists(model_path)

    _logger.info("generate crf model {0}".format(model_path))
    return model_path
def _try_method(self, conf, online=True):
    """Feed the test log to the configured template generator.

    Args:
        conf: Configuration used to build the generator and to parse
            ``self._path_testlog``.
        online: When true, lines are passed one at a time to
            ``process_line``; otherwise all lines are numbered from 0 and
            passed to ``process_offline`` as one dict.

    Returns:
        The ``TemplateTable`` that was handed to the generator.
    """
    table = lt_common.TemplateTable()
    ltgen = manager.init_ltgen_methods(conf, table)
    plines = manager.iter_plines(conf, [self._path_testlog])

    if not online:
        # Offline mode expects a {message id: parsed line} mapping.
        ltgen.process_offline(dict(enumerate(plines)))
        return table

    for pline in plines:
        ltgen.process_line(pline)
    return table
def generate_lt_file(conf, fp):
    """Print every line of a file followed by its estimated template.

    For each line of ``fp`` three lines are written to standard output:
    the right-stripped input line, the estimated template joined with
    single spaces, and an empty line.

    Args:
        conf: Configuration for the log parser and the "crf" generator.
        fp: Path of the text file to read.
    """
    parser = logparser.LogParser(conf)
    estimator = lt_common.init_ltgen(conf, lt_common.TemplateTable(), "crf")

    with open(fp) as f:
        for raw in f:
            stripped = raw.rstrip()
            # Only the last two fields of the parse result are needed here.
            _, _, l_w, l_s = parser.process_line(stripped)
            template = estimator.estimate_tpl(l_w, l_s)
            print(stripped)
            print(" ".join(template))
            print("")
def test_tagging(self):
    """Train from the training file and check the template of the test data.

    The generated template must contain the word "ssh" and the
    ``lt_common.REPLACER`` symbol.
    """
    from amulog.alg.crf import init_ltgen

    conf = config.open_config()
    conf["log_template_crf"]["model_filename"] = self._path_model

    generator = init_ltgen(conf, lt_common.TemplateTable())
    generator.init_trainer()
    generator.train_from_file(self._path_trainfile)

    template = generator.generate_tpl({"words": self.data_test})

    self.assertTrue("ssh" in template)
    self.assertTrue(lt_common.REPLACER in template)
def make_crf_model(conf, iterobj, output=None, return_sampled_messages=False):
    """Sample training messages and train a CRF model from them.

    Args:
        conf: Configuration for the CRF generator and the sampler.
        iterobj: Message source forwarded to ``make_crf_train``.
        output: Optional output argument passed through to ``train``.
        return_sampled_messages: When true, the sampled training messages
            are returned together with the model path.

    Returns:
        The model path, or ``(model_path, sampled_messages)`` when
        ``return_sampled_messages`` is true.
    """
    from . import lt_crf

    trainer = lt_crf.init_ltgen_crf(conf, lt_common.TemplateTable())
    sampled = make_crf_train(conf, iterobj)

    trainer.init_trainer()
    model_path = trainer.train(sampled, output)
    # Sanity check: training is expected to leave a model file on disk.
    assert os.path.exists(model_path)
    _logger.info("generate crf model {0}".format(model_path))

    return (model_path, sampled) if return_sampled_messages else model_path
def test_tagging(self):
    """Train an ``LTGenCRF`` on the in-memory data and tag the test line.

    The template looked up for the test line must contain the word "ssh"
    and the configured variable symbol. The trained model is removed at
    the end of a passing run.
    """
    conf = config.open_config()
    sym = conf.get("log_template", "variable_symbol")
    generator = lt_crf.LTGenCRF(lt_common.TemplateTable(), sym, conf)

    # Each training item is a whitespace-separated string; split every one.
    train_items = [[item.split() for item in data_line]
                   for data_line in self.data_train]

    generator.init_trainer()
    generator.train(train_items)

    tid, _ = generator.process_line(self.data_test, None)
    template = generator._table.get_template(tid)

    self.assertTrue("ssh" in template)
    self.assertTrue(sym in template)
    common.rm(generator.model)
def measure_parameters(conf, targets, method):
    """Run one template-generation trial per parameter candidate.

    For every candidate returned by ``_get_param_candidates(method)`` a
    fresh ``TemplateTable`` and generator are created, the target log
    lines are processed offline, and each result is recorded in a
    ``ParameterSearcher`` next to the corresponding answer.

    Args:
        conf: Configuration object.
        targets: Log targets forwarded to ``amulog.manager.iter_plines``.
        method: Template generation method name.

    Returns:
        ParameterSearcher: The searcher holding the recorded trials.

    Raises:
        Exception: Whatever ``ltgen.get_tpl`` raises for a template id is
            propagated. The previous implementation caught everything and
            dropped into ``pdb``, which blocks non-interactive runs and,
            on continue, left ``tpl_trial`` unbound or stale.
    """
    param_candidates = list(_get_param_candidates(method))
    ps = ParameterSearcher(conf, len(param_candidates))
    ps.load()

    # NOTE(review): log_db is not referenced in this function; the import is
    # kept in case it is needed for its side effects -- confirm and remove.
    from amulog import log_db  # noqa: F401

    for trial_id, params in enumerate(param_candidates):
        timer = common.Timer(
            "measure-parameters trial{0}".format(trial_id), output=_logger)
        timer.start()
        ps.init_trial(trial_id, params)

        # Every trial starts from an empty table and a new generator.
        table = lt_common.TemplateTable()
        ltgen = _init_ltgen_with_params(conf, table, method, params)
        input_lines = list(amulog.manager.iter_plines(conf, targets))
        d_tid = ltgen.process_offline(input_lines)

        iterobj = zip(input_lines,
                      ps.tid_list_answer(),
                      ps.iter_tpl_answer())
        for mid, (pline, tid_answer, tpl_answer) in enumerate(iterobj):
            # Messages without an answer are recorded with empty results.
            tid_trial = None if tid_answer is None else d_tid[mid]
            tpl_trial = None if tid_trial is None else ltgen.get_tpl(tid_trial)
            ps.add_trial(tid_trial, tpl_trial,
                         tid_answer, tpl_answer, pline["words"])

        ps.dump_trial()
        timer.stop()

    return ps
# Constructor: stores settings read from ``conf`` and builds the helpers used
# for template processing.
#
# Parameters:
#   conf     -- configuration; options are read from the "manager" section
#               (indata_filename, fail_output, online_batchsize,
#               offline_batchsize, undefined_host, n_process) and from
#               "log_template_import" (shuffle).
#   db       -- stored as ``self._db`` and passed to ``restore_ltg``.
#   lttable  -- stored as ``self._lttable`` and passed to ``restore_ltg``.
#   reset_db -- when false, ``self._ltgroup.restore_ltg(db, lttable)`` is called.
#   parallel -- when true, a ``multiprocessing.Pool`` is created with
#               ``self._init_pool`` as initializer; the worker count is
#               "manager.n_process" if that value is all digits, otherwise
#               ``os.cpu_count()``. When false, the results of
#               ``load_log2seq``, ``host_alias.init_hostalias`` and
#               ``init_ltgen_methods`` are stored on the instance instead.
#
# NOTE(review): ``self._drop_undefhost`` is read from conf twice with the same
# section/option; the second read looks redundant -- confirm and drop it.
# NOTE(review): the line layout of this method was lost, so it is unclear
# whether the ``_ltgroup`` setup and ``restore_ltg`` call belong to the
# non-parallel branch only or run in both modes -- confirm before editing.
def __init__(self, conf, db, lttable, reset_db=False, parallel=False): self._conf = conf self._reset_db = reset_db self._filename = conf["manager"]["indata_filename"] self._fail_output = conf["manager"]["fail_output"] self._online_batchsize = conf.getint("manager", "online_batchsize") self._online_counter = 0 self._offline_batchsize = conf.getint("manager", "offline_batchsize") self._drop_undefhost = conf.getboolean("manager", "undefined_host") self._shuffle_import = conf.getboolean("log_template_import", "shuffle") self._db = db self._lttable = lttable self._table = lt_common.TemplateTable() self._ltgen: Optional[lt_common.LTGen] = None self._pool = None if parallel: tmp_n_proc = conf["manager"]["n_process"] if tmp_n_proc.isdigit(): n_proc = int(tmp_n_proc) else: n_proc = os.cpu_count() ltgen_kwargs = {"conf": conf, "table": None, # individual table for child process "shuffle": self._shuffle_import} from multiprocessing import Pool self._pool = Pool(processes=n_proc, initializer=self._init_pool, initargs=(ltgen_kwargs,)) else: self._lp = load_log2seq(self._conf) self._ha = host_alias.init_hostalias(self._conf) self._drop_undefhost = conf.getboolean("manager", "undefined_host") self._ltgen = init_ltgen_methods(self._conf, self._table) self._ltgroup = init_ltgroup(self._conf, self._table) if not self._reset_db: self._ltgroup.restore_ltg(self._db, self._lttable)