def test_candidates():
    """Smoke-test relationship discovery, candidate computation and log building.

    Checks that exactly eight relationships with the expected names are found
    in the meta-model at ``openslex_file_path``, dumps the class statistics to
    ``output/dumps/stats_mm.json``, computes the case notion candidates and
    builds a log for (at most) the first five of them after a seeded shuffle.
    """
    expected_relationships = (
        'CONCERT_HALL_FK',
        'SEAT_HALL_FK',
        'BP_CONCERT_FK',
        'BP_BAND_FK',
        'TICKET_CONCERT',
        'TICKET_SEAT',
        'BOOKING_CON',
        'BOOKING_FK',
    )

    mm_engine = ex.create_mm_engine(openslex_file_path)

    rs = cn.get_relationships(mm_engine)
    assert len(rs) == 8
    for r in rs:
        assert r['rs'] in expected_relationships

    stats = cn.get_stats_mm(mm_engine)
    os.makedirs('output/dumps/', exist_ok=True)
    # Use a context manager so the dump is flushed and the handle is closed.
    with open('output/dumps/stats_mm.json', 'wt') as stats_file:
        json.dump(stats, stats_file, indent=True)

    candidates = cn.compute_candidates(mm_engine)

    # Fixed seed keeps the selection of candidates reproducible between runs.
    # NOTE(review): ``candidates`` is shuffled in place and then read through
    # ``.values()``; confirm the returned container supports both operations.
    random.seed(0)
    random.shuffle(candidates)

    # Slicing already clamps to the available length, no explicit min() needed.
    for idx, cand in enumerate(list(candidates.values())[:5]):
        log_name = 'log_test_{}'.format(idx)
        print('Computing Log: {}'.format(log_name))
        cn.build_log_for_case_notion(mm_engine,
                                     cand,
                                     proc_name='proc_test_{}'.format(idx),
                                     log_name=log_name)

    assert len(candidates) > 0
def check_mm(openslex_file_path, connection_params, metadata):
    """Compare per-table row counts between a meta-model and its source database.

    For every table name in ``metadata.tables`` the number of objects of the
    class with that name in the meta-model is compared with ``count(*)`` of
    the table in the source database.

    Args:
        openslex_file_path: path of the meta-model file passed to
            ``ex.create_mm_engine``.
        connection_params: keyword arguments forwarded to
            ``ex.create_db_engine``.
        metadata: object exposing a ``tables`` mapping keyed by table name.

    Returns:
        ``True`` when all counts match.

    Raises:
        AssertionError: when at least one table has a different count.
    """
    from contextlib import ExitStack

    check = True
    # The ExitStack guarantees that connections and engines are released even
    # when a query or the final assertion fails.
    with ExitStack() as cleanup:
        mm_engine = ex.create_mm_engine(openslex_file_path)
        cleanup.callback(mm_engine.dispose)
        mm_conn = mm_engine.connect()
        cleanup.callback(mm_conn.close)

        db_engine = ex.create_db_engine(**connection_params)
        cleanup.callback(db_engine.dispose)
        db_conn = db_engine.connect()
        cleanup.callback(db_conn.close)

        for t in metadata.tables.keys():
            # The class name is passed as a bound parameter: a double-quoted
            # value inside the SQL text may be parsed as an identifier and it
            # breaks on names that contain quotes.
            mm_filter = text(
                'O.class_id == CL.id and CL.name == :class_name'
            ).bindparams(class_name=t)
            mm_q = sq.select([text('count(O.id) as num')]) \
                .select_from(text('object as O, class as CL')) \
                .where(mm_filter)
            mm_res = mm_conn.execute(mm_q)
            mm_num = mm_res.first()
            mm_res.close()

            # A table name is an identifier and cannot be bound as a parameter.
            db_q = sq.select([text('count(*) as num')]).select_from(text(t))
            db_res = db_conn.execute(db_q)
            db_num = db_res.first()
            db_res.close()

            check_t = mm_num['num'] == db_num['num']
            check = check and check_t

        assert check

    print(check)
    return check
def test_default_model(openslex=train_openslex_file_path, ground_truth=ground_truth_path):
    """Run event definition discovery with the 'default' model and print scores.

    The ground truth is loaded for the in-table, lookup and timestamp-field
    candidates, the predictions returned by the discovery are scored against
    it, and the three score objects are pretty-printed.
    """
    engine = ex.create_mm_engine(openslex)
    meta = ex.get_mm_meta(engine)
    discovered = ev.discover_event_definitions(engine, meta, model='default')
    discoverer = discovered['aid']

    # Ground truth is loaded in the order: in-table, lookup, timestamp fields.
    truth = {
        kind: discoverer.load_y_true(discovered[kind]['candidates'], ground_truth)
        for kind in (ev.CT_IN_TABLE, ev.CT_LOOKUP, ev.CT_TS_FIELD)
    }

    # Scores are computed (and later printed) in the order:
    # timestamp fields, in-table, lookup.
    report_layout = (
        ('Score Ts Fields', ev.CT_TS_FIELD),
        ('Score In Table', ev.CT_IN_TABLE),
        ('Score Lookup', ev.CT_LOOKUP),
    )
    report = [
        (title, discoverer.score(truth[kind], discovered[kind]['predicted']))
        for title, kind in report_layout
    ]

    for title, scores in report:
        print(title)
        pprint(scores)
def export_log(mm_path, log_id, output_file):
    """Export the log ``log_id`` of the meta-model at ``mm_path`` as CSV.

    The log is converted to a data frame with ``cn.log_to_dataframe`` and
    written to ``output_file`` with the index column labelled ``idx``.
    """
    engine = ex.create_mm_engine(mm_path)
    meta = ex.get_mm_meta(engine)
    log_frame: pd.DataFrame = cn.log_to_dataframe(
        mm_engine=engine,
        mm_meta=meta,
        log_id=log_id,
    )
    log_frame.to_csv(output_file, index_label='idx')
def print_cn(mm_path, cn_id, cn_dir, output_file, view):
    """Render the case notion ``cn_id`` stored in ``cn_dir/candidates.pkl``.

    Args:
        mm_path: path of the meta-model file.
        cn_id: key of the case notion inside the pickled candidates.
        cn_dir: directory that contains ``candidates.pkl``.
        output_file: forwarded to ``dump_cn_dot``.
        view: forwarded to ``dump_cn_dot``.

    Raises:
        Exception: with the message ``'No case notion to show'`` when the
            candidates cannot be loaded, ``cn_id`` is not found, or rendering
            fails. The underlying error is attached as ``__cause__``.
    """
    mm_engine = ex.create_mm_engine(mm_path)
    mm_meta = ex.get_mm_meta(mm_engine)
    try:
        # NOTE(review): unpickling can execute arbitrary code; only point this
        # at candidate files produced by a trusted run.
        with open(Path(cn_dir, 'candidates.pkl'), 'rb') as candidates_file:
            candidates = pickle.load(candidates_file)
        case_notion = candidates[cn_id]
        dump_cn_dot(mm_engine, mm_meta, case_notion, output_file, view)
    except Exception as err:
        # ``except Exception`` (not a bare except) lets KeyboardInterrupt and
        # SystemExit propagate; chaining keeps the real cause in the traceback.
        raise Exception('No case notion to show') from err
def print_cn_log(mm_path, log_id, output_file, view):
    """Render the case notion recorded in the attributes of log ``log_id``.

    Raises:
        Exception: when the log attributes hold no ``'case_notion'`` entry.
    """
    engine = ex.create_mm_engine(mm_path)
    meta = ex.get_mm_meta(engine)
    log_details = cn.log_info(engine, meta, log_id)

    # Guard clause: nothing can be rendered without a stored case notion.
    if 'case_notion' not in log_details['attributes']:
        raise Exception('No case notion to show')

    dump_cn_dot(engine, meta, log_details['attributes']['case_notion'],
                output_file, view)
def test_candidates():
    """Train a model, discover event definitions and compute a few events.

    The model is trained on the training meta-model, event definitions are
    discovered with it, and events are computed on a copy of the meta-model
    for the first two positively predicted candidates of each candidate type.
    """
    engine = ex.create_mm_engine(train_openslex_file_path)
    meta = ex.get_mm_meta(engine)
    os.makedirs(dumps_dir, exist_ok=True)

    classes = None
    model = ev.train_model(mm_engine=engine,
                           mm_meta=meta,
                           y_true_path=ground_truth_path,
                           classes=classes,
                           model_output=trained_model)
    disc = ev.discover_event_definitions(mm_engine=engine,
                                         mm_meta=meta,
                                         classes=classes,
                                         dump_dir=dumps_dir,
                                         model=model)

    # Events are written to a copy so the training meta-model stays untouched.
    shutil.copyfile(train_openslex_file_path, modified_mm_path)
    engine_modif = ex.create_mm_engine(modified_mm_path)
    meta_modif = ex.get_mm_meta(engine_modif)

    # Keep only the candidates predicted as 1, one list per candidate type.
    positives_per_type = [
        [
            candidate
            for flag, candidate in zip(disc[kind]['predicted'],
                                       disc[kind]['candidates'])
            if flag == 1
        ]
        for kind in (ev.CT_TS_FIELD, ev.CT_IN_TABLE, ev.CT_LOOKUP)
    ]

    for positives in positives_per_type:
        ev.compute_events(engine_modif, meta_modif, positives[:2])
def disc_and_build(mm: Path, new_mm: Path, dump_dir: Path, build_events=False):
    """Discover event definitions for a meta-model, caching every step on disk.

    Each intermediate result (timestamp attributes, candidates, features and
    final candidates) is loaded from ``dump_dir`` when its file exists and is
    computed and saved there otherwise.

    Args:
        mm: path of the meta-model to analyse.
        new_mm: path the meta-model is copied to before events are computed;
            only used when ``build_events`` is true.
        dump_dir: directory for the cached files; created when missing. A
            plain string is accepted as well as a ``Path``.
        build_events: when true, copy ``mm`` to ``new_mm`` and compute the
            events of all final candidates in the copy.
    """
    print("Discovering and building events for: {}".format(mm))
    print("In: {}".format(new_mm))
    print("Dumping in: {}".format(dump_dir))

    def cached(path, load, build, save):
        """Return ``load(path)`` if the file exists, else build and save it."""
        if os.path.exists(path):
            return load(path)
        result = build()
        save(result, path)
        return result

    def load_json(path):
        with open(path, 'rt') as json_file:
            return json.load(json_file)

    def save_json(obj, path):
        with open(path, 'wt') as json_file:
            json.dump(obj, json_file, indent=True)

    mm_engine_train = ex.create_mm_engine(mm)
    mm_meta_train = ex.get_mm_meta(mm_engine_train)

    # Coerce so that the ``/`` joins below also work for string arguments.
    cached_dir_train = Path(dump_dir)
    ts_train_path = cached_dir_train / 'timestamps.json'
    candidates_ts_fields_path = cached_dir_train / 'candidates_ts_fields.json'
    candidates_in_table_path = cached_dir_train / 'candidates_in_table.json'
    candidates_lookup_path = cached_dir_train / 'candidates_lookup.json'
    features_in_table_path = cached_dir_train / 'features_in_table.json'
    features_lookup_path = cached_dir_train / 'features_lookup.json'
    final_candidates_ts_fields_path = cached_dir_train / 'final_candidates_ts_fields.json'
    final_candidates_in_table_path = cached_dir_train / 'final_candidates_in_table.json'
    final_candidates_lookup_path = cached_dir_train / 'final_candidates_lookup.json'

    os.makedirs(cached_dir_train, exist_ok=True)

    aid = ev.ActivityIdentifierDiscoverer(engine=mm_engine_train,
                                          meta=mm_meta_train,
                                          model='default')

    timestamp_attrs = cached(ts_train_path,
                             aid.load_timestamp_attributes,
                             aid.get_timestamp_attributes,
                             aid.save_timestamp_attributes)

    # Candidates, one set per candidate type.
    def generate(candidate_type):
        return aid.generate_candidates(timestamp_attrs=timestamp_attrs,
                                       candidate_type=candidate_type)

    candidates_ts_fields = cached(candidates_ts_fields_path,
                                  aid.load_candidates,
                                  lambda: generate(ev.CT_TS_FIELD),
                                  aid.save_candidates)
    candidates_in_table = cached(candidates_in_table_path,
                                 aid.load_candidates,
                                 lambda: generate(ev.CT_IN_TABLE),
                                 aid.save_candidates)
    candidates_lookup = cached(candidates_lookup_path,
                               aid.load_candidates,
                               lambda: generate(ev.CT_LOOKUP),
                               aid.save_candidates)

    # Features are only computed for in-table and lookup candidates.
    X_in_table = cached(features_in_table_path,
                        aid.load_features,
                        lambda: aid.compute_features(candidates_in_table, verbose=1),
                        aid.save_features)
    X_lookup = cached(features_lookup_path,
                      aid.load_features,
                      lambda: aid.compute_features(candidates_lookup, verbose=1),
                      aid.save_features)

    def predicted_positive(features, candidates, candidate_type):
        """Keep the candidates whose prediction equals 1."""
        predicted = aid.predict(features, candidate_type=candidate_type)
        return [c for p, c in zip(predicted, candidates) if p == 1]

    # Every timestamp-field candidate is accepted without a prediction step.
    final_candidates_ts_fields = cached(
        final_candidates_ts_fields_path,
        load_json,
        lambda: list(candidates_ts_fields),
        save_json)
    final_candidates_in_table = cached(
        final_candidates_in_table_path,
        load_json,
        lambda: predicted_positive(X_in_table, candidates_in_table, ev.CT_IN_TABLE),
        save_json)
    final_candidates_lookup = cached(
        final_candidates_lookup_path,
        load_json,
        lambda: predicted_positive(X_lookup, candidates_lookup, ev.CT_LOOKUP),
        save_json)

    if build_events:
        # Work on a copy so the input meta-model is left unmodified.
        shutil.copyfile(mm, new_mm)
        mm_engine_modif = ex.create_mm_engine(new_mm)
        mm_meta_modif = ex.get_mm_meta(mm_engine_modif)

        ev.compute_events(mm_engine_modif, mm_meta_modif, final_candidates_ts_fields)
        ev.compute_events(mm_engine_modif, mm_meta_modif, final_candidates_in_table)
        ev.compute_events(mm_engine_modif, mm_meta_modif, final_candidates_lookup)
def info_log(mm_path, log_id):
    """Pretty-print what ``cn.log_info`` returns for the log ``log_id``."""
    engine = ex.create_mm_engine(mm_path)
    meta = ex.get_mm_meta(engine)
    pprint(cn.log_info(engine, meta, log_id))
def list_logs(mm_path):
    """Pretty-print what ``cn.list_logs`` returns for the meta-model at ``mm_path``."""
    engine = ex.create_mm_engine(mm_path)
    meta = ex.get_mm_meta(engine)
    pprint(cn.list_logs(engine, meta))
def case_notion_candidates_cached(mm_path, dump_dir, build_logs=False, topk=None):
    """Compute, rank and optionally build logs for case notion candidates.

    Every intermediate result (class statistics, candidates, bounds,
    prediction, ranking parameters and ranking) is read from ``dump_dir``
    when its file exists and is computed and written there otherwise.

    Args:
        mm_path: path of the meta-model file.
        dump_dir: directory for the cached files; created when missing.
        build_logs: when true, copy the meta-model to
            ``dump_dir/mm-modif-logs.slexmm`` and build one log per ranked
            candidate in that copy.
        topk: when truthy, only the first ``topk`` entries of the ranking are
            built; otherwise all of them.
    """

    def read_json(path):
        with open(path, 'rt') as json_file:
            return json.load(json_file)

    def write_json(obj, path):
        with open(path, 'wt') as json_file:
            json.dump(obj, json_file, indent=True)

    mm_engine = ex.create_mm_engine(mm_path)
    mm_meta = ex.get_mm_meta(mm_engine)

    os.makedirs(dump_dir, exist_ok=True)

    class_stats_path = Path(dump_dir, 'class_stats_mm.json')
    candidates_path = Path(dump_dir, 'candidates.pkl')
    bounds_path = Path(dump_dir, 'bounds.json')
    prediction_path = Path(dump_dir, 'prediction.json')
    params_path = Path(dump_dir, 'params.json')
    ranking_path = Path(dump_dir, 'ranking.json')
    mm_modif_path = Path(dump_dir, 'mm-modif-logs.slexmm')

    if os.path.isfile(class_stats_path):
        class_stats = read_json(class_stats_path)
    else:
        class_stats = cn.get_stats_mm(mm_engine)
        write_json(class_stats, class_stats_path)

    if os.path.isfile(candidates_path):
        candidates_mem = cn.load_candidates(candidates_path)
    else:
        candidates = cn.compute_candidates(mm_engine, cache_dir=dump_dir)
        candidates_mem = cn.save_candidates(candidates, candidates_path)

    if os.path.isfile(bounds_path):
        bounds = read_json(bounds_path)
    else:
        bounds = cn.compute_bounds_of_candidates(candidates_mem, mm_engine,
                                                 mm_meta, class_stats)
        write_json(bounds, bounds_path)

    if os.path.isfile(prediction_path):
        pred = read_json(prediction_path)
    else:
        pred = cn.compute_prediction_from_bounds(bounds, 0.5, 0.5, 0.5)
        write_json(pred, prediction_path)

    if os.path.isfile(params_path):
        params = read_json(params_path)
    else:
        # Defaults forwarded as keyword arguments to
        # ``cn.compute_detailed_ranking``; edit params.json to override them.
        params = {
            'mode_sp': None,
            'max_sp': None,
            'min_sp': None,
            'mode_lod': None,
            'max_lod': 10,
            'min_lod': 3,
            'mode_ae': None,
            'max_ae': 3000,
            'min_ae': 0,
            'w_sp': 0.33,
            'w_lod': 0.33,
            'w_ae': 0.33,
        }
        write_json(params, params_path)

    pprint(params)

    # The ranking file holds CSV data despite its ``.json`` name.
    if os.path.isfile(ranking_path):
        # ``DataFrame.from_csv`` no longer exists in pandas >= 1.0; this is the
        # documented equivalent call.
        with open(ranking_path, 'rt') as ranking_file:
            detailed_ranking = pd.read_csv(ranking_file, index_col=0,
                                           parse_dates=True)
    else:
        detailed_ranking = cn.compute_detailed_ranking(pred, **params)
        with open(ranking_path, 'wt') as ranking_file:
            detailed_ranking.to_csv(ranking_file)

    ranking = detailed_ranking['cn_id']

    pprint(detailed_ranking)

    if build_logs:
        # Logs are built in a copy so the input meta-model stays untouched.
        shutil.copyfile(mm_path, mm_modif_path)
        mm_engine_modif = ex.create_mm_engine(mm_modif_path)
        mm_meta_modif = ex.get_mm_meta(mm_engine_modif)

        ranking_to_build = ranking[:topk] if topk else ranking

        for idx in ranking_to_build:
            c = candidates_mem[idx]
            proc_name = 'proc_{}'.format(idx)
            log_name = 'log_{}'.format(idx)
            print('Building Log: {}'.format(log_name))
            cn.build_log_for_case_notion(mm_engine_modif,
                                         c,
                                         proc_name=proc_name,
                                         log_name=log_name,
                                         metadata=mm_meta_modif)
def test_candidates():
    """Check that log metrics lie within the bounds computed per case notion.

    Works on a fresh copy of the original meta-model. For every candidate a
    log is built, its support, AE and LoD are computed, and each value is
    asserted to lie between the lower and upper bound computed for the
    candidate from the class statistics.
    """
    expected_relationships = (
        'CONCERT_HALL_FK',
        'SEAT_HALL_FK',
        'BP_CONCERT_FK',
        'BP_BAND_FK',
        'TICKET_CONCERT',
        'TICKET_SEAT',
        'BOOKING_CON',
        'BOOKING_FK',
    )

    shutil.copyfile(openslex_file_path_orig, openslex_file_path)

    mm_engine = ex.create_mm_engine(openslex_file_path)
    metadata = ex.get_mm_meta(mm_engine)

    rs = cn.get_relationships(mm_engine)
    assert len(rs) == 8
    for r in rs:
        assert r['rs'] in expected_relationships

    class_stats = cn.get_stats_mm(mm_engine)
    os.makedirs('output/dumps/', exist_ok=True)
    with open('output/dumps/stats_mm.json', 'wt') as stats_file:
        json.dump(class_stats, stats_file, indent=True)

    candidates = cn.compute_candidates(mm_engine)

    # Round-trip the candidates through JSON. The write handle must be closed
    # before the file is read again, otherwise the data may not be flushed.
    with open('output/dumps/candidates.json', 'wt') as candidates_file:
        json.dump(candidates, candidates_file, indent=True)
    with open('output/dumps/candidates.json', 'rt') as candidates_file:
        candidates = json.load(candidates_file, object_hook=cn.CaseNotion)

    for idx, cand in enumerate(candidates):
        log_name = 'log_test_{}'.format(idx)
        print('Computing Log: {}'.format(log_name))
        log_id = cn.build_log_for_case_notion(
            mm_engine,
            cand,
            proc_name='proc_test_{}'.format(idx),
            log_name=log_name,
            metadata=metadata)

        # Metrics measured on the log that was actually built.
        sp = cn.compute_support_log(mm_engine, log_id, metadata)
        print('Support for {}: {}'.format(log_id, sp))
        ae = cn.compute_ae_log(mm_engine, log_id, metadata, sp)
        print('AE for {}: {}'.format(log_id, ae))
        lod = cn.compute_lod_log(mm_engine, log_id, metadata, sp)
        print('LoD for {}: {}'.format(log_id, lod))

        # Bounds derived from the candidate and the class statistics.
        sp_lb = cn.compute_lb_support_cn(mm_engine, cand, metadata, class_stats)
        print('Lower Bound of Support for {}: {}'.format(idx, sp_lb))
        sp_ub = cn.compute_ub_support_cn(mm_engine, cand, metadata, class_stats)
        print('Upper Bound of Support for {}: {}'.format(idx, sp_ub))
        lod_lb = cn.compute_lb_lod_cn(mm_engine, cand, metadata, class_stats)
        print('Lower Bound of LoD for {}: {}'.format(idx, lod_lb))
        lod_ub = cn.compute_ub_lod_cn(mm_engine, cand, metadata, class_stats)
        print('Upper Bound of LoD for {}: {}'.format(idx, lod_ub))
        ae_lb = cn.compute_lb_ae_cn(mm_engine, cand, metadata, class_stats)
        print('Lower Bound of AE for {}: {}'.format(idx, ae_lb))
        ae_ub = cn.compute_ub_ae_cn(mm_engine, cand, metadata, class_stats)
        print('Upper Bound of AE for {}: {}'.format(idx, ae_ub))

        assert sp_ub >= sp >= sp_lb
        assert lod_ub >= lod >= lod_lb
        assert ae_ub >= ae >= ae_lb

    assert len(candidates) > 0
def evaluate_cn(mm_filepath: str, output_dir):
    """Build a log for every case notion candidate and record its metrics.

    The meta-model is copied to ``output_dir/mm-modif.slexmm`` and all work is
    done on that copy. For each candidate the log metrics (support, LoD, AE),
    the lower/upper bounds of those metrics, the number of classes, the total
    number of events of its classes and the mean events-per-object ratio are
    collected. The table is written to ``output_dir/df.json`` and
    ``output_dir/df.pickle``; the class statistics go to
    ``output_dir/class_stats.json``.

    Args:
        mm_filepath: path of the input meta-model file.
        output_dir: directory for all outputs; created when missing.
    """
    os.makedirs(output_dir, exist_ok=True)

    mm_filepath_tmp = '{}/mm-modif.slexmm'.format(output_dir)
    df_filepath = '{}/df.json'.format(output_dir)
    df_pickle_file = '{}/df.pickle'.format(output_dir)
    class_stats_filepath = '{}/class_stats.json'.format(output_dir)

    shutil.copyfile(mm_filepath, mm_filepath_tmp)

    mm_engine = edex.create_mm_engine(mm_filepath_tmp)
    mm_meta = edex.get_mm_meta(mm_engine)

    cands = edcn.compute_candidates(mm_engine=mm_engine,
                                    max_length_path=5,
                                    cache_dir=output_dir)

    # One list per output column; row i describes candidate i.
    stats = {
        'proc_name': [],
        'log_name': [],
        'log_id': [],
        'log_sp': [],
        'log_lod': [],
        'log_ae': [],
        'cn_sp_lb': [],
        'cn_sp_ub': [],
        'cn_lod_lb': [],
        'cn_lod_ub': [],
        'cn_ae_lb': [],
        'cn_ae_ub': [],
        'num_classes': [],
        'e': [],
        'ir': [],
    }

    class_stats = edcn.get_stats_mm(mm_engine, mm_meta)
    with open(class_stats_filepath, mode='wt') as class_stats_file:
        json.dump(class_stats, class_stats_file, indent=True)

    # Each value of ``cands`` is expected to be a CaseNotion.
    for i, case_notion in enumerate(tqdm(cands.values(), total=len(cands),
                                         desc='Candidates')):
        proc_name = 'proc-{}'.format(i)
        log_name = 'log-{}'.format(i)
        log_id = edcn.build_log_for_case_notion(
            mm_engine, case_notion, proc_name, log_name, mm_meta)

        stats['proc_name'].append(proc_name)
        stats['log_name'].append(log_name)
        stats['log_id'].append(log_id)

        # Metrics measured on the log that was just built.
        log_sp = edcn.compute_support_log(mm_engine, log_id, mm_meta)
        log_lod = edcn.compute_lod_log(mm_engine, log_id, mm_meta)
        log_ae = edcn.compute_ae_log(mm_engine, log_id, mm_meta)

        # Bounds derived from the case notion and the class statistics.
        cn_sp_lb = edcn.compute_lb_support_cn(mm_engine, case_notion, mm_meta, class_stats)
        cn_sp_ub = edcn.compute_ub_support_cn(mm_engine, case_notion, mm_meta, class_stats)
        cn_lod_lb = edcn.compute_lb_lod_cn(mm_engine, case_notion, mm_meta, class_stats)
        cn_lod_ub = edcn.compute_ub_lod_cn(mm_engine, case_notion, mm_meta, class_stats)
        cn_ae_lb = edcn.compute_lb_ae_cn(mm_engine, case_notion, mm_meta, class_stats)
        cn_ae_ub = edcn.compute_ub_ae_cn(mm_engine, case_notion, mm_meta, class_stats)

        stats['log_sp'].append(log_sp)
        stats['log_lod'].append(log_lod)
        stats['log_ae'].append(log_ae)
        stats['cn_sp_lb'].append(cn_sp_lb)
        stats['cn_sp_ub'].append(cn_sp_ub)
        stats['cn_lod_lb'].append(cn_lod_lb)
        stats['cn_lod_ub'].append(cn_lod_ub)
        stats['cn_ae_lb'].append(cn_ae_lb)
        stats['cn_ae_ub'].append(cn_ae_ub)

        class_ids = case_notion.get_classes_ids()
        stats['num_classes'].append(len(class_ids))

        num_e = 0
        sum_e_per_o = 0.0
        for c_id in class_ids:
            class_entry = class_stats[str(c_id)]
            num_e = num_e + class_entry['e']
            # max(..., 1) avoids a division by zero when 'o_w_ev' is 0.
            sum_e_per_o += float(class_entry['ev_o']) / float(max(class_entry['o_w_ev'], 1))

        stats['e'].append(num_e)
        stats['ir'].append(sum_e_per_o / len(class_ids))

    df = pd.DataFrame(stats)

    df.to_json(df_filepath)
    with open(df_pickle_file, mode='wb') as pickle_file:
        pickle.dump(df, pickle_file)
def test_candidates_cached():
    """Discover events with the 'default' model, reusing results cached on disk.

    Timestamp attributes, candidates and features are loaded from
    ``output/adw/ev_disc`` when their files exist and are computed and saved
    otherwise. Events are then computed, on a copy of the training
    meta-model, for the first two positively predicted candidates of each
    candidate type.
    """
    engine = ex.create_mm_engine(train_openslex_file_path)
    meta = ex.get_mm_meta(engine)

    cached_dir_train = 'output/adw/ev_disc'

    def cache_file(name):
        return '{}/{}'.format(cached_dir_train, name)

    def load_or_build(path, load, build, save):
        """Use the cached file when present; otherwise build and store it."""
        if not os.path.exists(path):
            built = build()
            save(built, path)
            return built
        return load(path)

    os.makedirs(cached_dir_train, exist_ok=True)

    aid = ev.ActivityIdentifierDiscoverer(engine=engine,
                                          meta=meta,
                                          model='default')

    timestamp_attrs = load_or_build(cache_file('timestamps.json'),
                                    aid.load_timestamp_attributes,
                                    aid.get_timestamp_attributes,
                                    aid.save_timestamp_attributes)

    def candidates_of(kind, file_name):
        return load_or_build(
            cache_file(file_name),
            aid.load_candidates,
            lambda: aid.generate_candidates(timestamp_attrs=timestamp_attrs,
                                            candidate_type=kind),
            aid.save_candidates)

    def features_of(candidates, file_name):
        return load_or_build(
            cache_file(file_name),
            aid.load_features,
            lambda: aid.compute_features(candidates, verbose=1),
            aid.save_features)

    ts_field_candidates = candidates_of(ev.CT_TS_FIELD, 'candidates_ts_fields.json')
    in_table_candidates = candidates_of(ev.CT_IN_TABLE, 'candidates_in_table.json')
    lookup_candidates = candidates_of(ev.CT_LOOKUP, 'candidates_lookup.json')

    in_table_features = features_of(in_table_candidates, 'features_in_table.json')
    lookup_features = features_of(lookup_candidates, 'features_lookup.json')

    # Timestamp-field candidates are all marked positive without a model.
    ts_field_flags = [1 for _ in ts_field_candidates]
    in_table_flags = aid.predict(in_table_features, candidate_type=ev.CT_IN_TABLE)
    lookup_flags = aid.predict(lookup_features, candidate_type=ev.CT_LOOKUP)

    shutil.copyfile(train_openslex_file_path, modified_mm_path)
    engine_modif = ex.create_mm_engine(modified_mm_path)
    meta_modif = ex.get_mm_meta(engine_modif)

    positives_per_type = [
        [cand for flag, cand in zip(flags, cands) if flag == 1]
        for flags, cands in (
            (ts_field_flags, ts_field_candidates),
            (in_table_flags, in_table_candidates),
            (lookup_flags, lookup_candidates),
        )
    ]

    for positives in positives_per_type:
        ev.compute_events(engine_modif, meta_modif, positives[:2])
def test_trained_model_cached(openslex_train=train_openslex_file_path,
                              ground_truth_train=ground_truth_path,
                              openslex_test=train_openslex_file_path,
                              ground_truth_test=ground_truth_path,
                              cached_dir_train='output/A/ev_disc',
                              cached_dir_test='output/B/ev_disc'):
    """Train event-definition classifiers on one meta-model and score on another.

    Timestamp attributes, candidates and features of both meta-models are
    loaded from their cache directory when the files exist and are computed
    and saved otherwise. Two XGBoost pipelines (in-table and lookup
    candidates) are trained, scored on the training data, pickled to
    ``cached_dir_train/model.pkl`` and then scored on the test meta-model.

    Args:
        openslex_train: path of the training meta-model.
        ground_truth_train: ground truth path for the training candidates.
        openslex_test: path of the test meta-model.
        ground_truth_test: ground truth path for the test candidates.
        cached_dir_train: cache directory for the training artefacts.
        cached_dir_test: cache directory for the test artefacts.
    """
    import numpy as np

    def cached(path, load, build, save):
        """Return ``load(path)`` if the file exists, else build and save it."""
        if os.path.exists(path):
            return load(path)
        result = build()
        save(result, path)
        return result

    def balanced_class_weight(y_true):
        """Balanced weights for classes 0 and 1; neutral weights on failure."""
        try:
            # ``classes`` and ``y`` are keyword-only in current scikit-learn;
            # the keyword form is accepted by older releases as well.
            return compute_class_weight('balanced',
                                        classes=np.array([0, 1]),
                                        y=y_true)
        except ValueError:
            # Raised e.g. when one of the classes is absent from ``y_true``.
            return [1.0, 1.0]

    def discover_inputs(aid, cache_dir, verbose):
        """Load or compute timestamps, candidates and features for ``aid``."""
        timestamp_attrs = cached('{}/timestamps.json'.format(cache_dir),
                                 aid.load_timestamp_attributes,
                                 aid.get_timestamp_attributes,
                                 aid.save_timestamp_attributes)

        def candidates_of(kind, path):
            return cached(
                path,
                aid.load_candidates,
                lambda: aid.generate_candidates(timestamp_attrs=timestamp_attrs,
                                                candidate_type=kind),
                aid.save_candidates)

        def features_of(candidates, path):
            return cached(
                path,
                aid.load_features,
                lambda: aid.compute_features(candidates, verbose=verbose),
                aid.save_features)

        ts_fields = candidates_of(
            ev.CT_TS_FIELD, '{}/candidates_ts_fields.json'.format(cache_dir))
        in_table = candidates_of(
            ev.CT_IN_TABLE, '{}/candidates_in_table.json'.format(cache_dir))
        lookup = candidates_of(
            ev.CT_LOOKUP, '{}/candidates_lookup.json'.format(cache_dir))

        X_in = features_of(in_table, '{}/features_in_table.json'.format(cache_dir))
        X_look = features_of(lookup, '{}/features_lookup.json'.format(cache_dir))
        return ts_fields, in_table, lookup, X_in, X_look

    model_trained_path = '{}/model.pkl'.format(cached_dir_train)

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(cached_dir_test, exist_ok=True)
    os.makedirs(cached_dir_train, exist_ok=True)

    # ---- Training -------------------------------------------------------
    mm_engine_train = ex.create_mm_engine(openslex_train)
    mm_meta_train = ex.get_mm_meta(mm_engine_train)

    aid = ev.ActivityIdentifierDiscoverer(engine=mm_engine_train,
                                          meta=mm_meta_train,
                                          model=None)

    (candidates_ts_fields, candidates_in_table, candidates_lookup,
     X_in_table, X_lookup) = discover_inputs(aid, cached_dir_train, verbose=1)

    y_true_train_in_table = aid.load_y_true(candidates_in_table,
                                            y_true_path=ground_truth_train)
    y_true_train_lookup = aid.load_y_true(candidates_lookup,
                                          y_true_path=ground_truth_train)

    class_weight_in_table = balanced_class_weight(y_true_train_in_table)
    class_weight_lookup = balanced_class_weight(y_true_train_lookup)

    # The weight of the positive class is used as ``scale_pos_weight``.
    classifiers = {
        ev.CT_IN_TABLE:
        ev.make_sklearn_pipeline(
            XGBClassifier(max_depth=2,
                          n_estimators=10,
                          random_state=1,
                          scale_pos_weight=class_weight_in_table[1])),
        ev.CT_LOOKUP:
        ev.make_sklearn_pipeline(
            XGBClassifier(max_depth=2,
                          n_estimators=10,
                          random_state=1,
                          scale_pos_weight=class_weight_lookup[1]))
    }

    aid.set_model(classifiers)

    aid.train_model(X_in_table, y_true_train_in_table,
                    candidate_type=ev.CT_IN_TABLE)
    aid.train_model(X_lookup, y_true_train_lookup,
                    candidate_type=ev.CT_LOOKUP)

    y_pred_in_table = aid.predict(X_in_table, candidate_type=ev.CT_IN_TABLE)
    y_pred_lookup = aid.predict(X_lookup, candidate_type=ev.CT_LOOKUP)

    scores_train_in_table = aid.score(y_true_train_in_table, y_pred_in_table)
    scores_train_lookup = aid.score(y_true_train_lookup, y_pred_lookup)

    print('Scores In Table')
    pprint(scores_train_in_table)
    print('Scores Lookup')
    pprint(scores_train_lookup)

    model_trained = classifiers

    with open(model_trained_path, mode='wb') as f:
        pickle.dump(classifiers, f)

    # ---- Testing --------------------------------------------------------
    mm_engine_test = ex.create_mm_engine(openslex_test)
    mm_meta_test = ex.get_mm_meta(mm_engine_test)

    aid_test = ev.ActivityIdentifierDiscoverer(engine=mm_engine_test,
                                               meta=mm_meta_test,
                                               model=model_trained)

    # The timestamp-field candidates are generated with ev.CT_TS_FIELD, as on
    # the training side (this used to be ev.CT_IN_TABLE). A cached
    # candidates_ts_fields.json in ``cached_dir_test`` written before this
    # fix holds in-table candidates and must be deleted to be regenerated.
    (candidates_test_ts_fields, candidates_test_in_table,
     candidates_test_lookup, feature_values_in_table_test,
     feature_values_lookup_test) = discover_inputs(aid_test, cached_dir_test,
                                                   verbose=True)

    # Timestamp-field candidates are all marked positive without a model.
    pred_test_ts_fields = [1 for _ in candidates_test_ts_fields]
    pred_test_in_table = aid_test.predict(feature_values_in_table_test,
                                          candidate_type=ev.CT_IN_TABLE)
    pred_test_lookup = aid_test.predict(feature_values_lookup_test,
                                        candidate_type=ev.CT_LOOKUP)

    y_true_in_table = aid_test.load_y_true(candidates_test_in_table,
                                           ground_truth_test)
    y_true_lookup = aid_test.load_y_true(candidates_test_lookup,
                                         ground_truth_test)
    y_true_ts_fields = aid_test.load_y_true(candidates_test_ts_fields,
                                            ground_truth_test)

    scores_ts_fields = aid_test.score(y_true_ts_fields, pred_test_ts_fields)
    scores_in_table = aid_test.score(y_true_in_table, pred_test_in_table)
    scores_lookup = aid_test.score(y_true_lookup, pred_test_lookup)

    print('Score Ts Fields')
    pprint(scores_ts_fields)
    print('Score In Table')
    pprint(scores_in_table)
    print('Score Lookup')
    pprint(scores_lookup)