コード例 #1
0
def test_candidates():
    """Smoke-test relationship discovery, stats dumping and log building.

    Opens the meta-model at ``openslex_file_path``, checks that exactly eight
    relationships with the expected names are reported, dumps the meta-model
    statistics to ``output/dumps/stats_mm.json`` and builds a log for at most
    five of the computed case notion candidates.
    """
    expected_names = (
        'CONCERT_HALL_FK', 'SEAT_HALL_FK', 'BP_CONCERT_FK', 'BP_BAND_FK',
        'TICKET_CONCERT', 'TICKET_SEAT', 'BOOKING_CON', 'BOOKING_FK',
    )

    mm_engine = ex.create_mm_engine(openslex_file_path)
    rs = cn.get_relationships(mm_engine)
    assert len(rs) == 8
    for r in rs:
        assert r['rs'] in expected_names

    stats = cn.get_stats_mm(mm_engine)

    os.makedirs('output/dumps/', exist_ok=True)

    # Close the dump deterministically instead of leaking the file handle.
    with open('output/dumps/stats_mm.json', 'wt') as stats_file:
        json.dump(stats, stats_file, indent=True)

    candidates = cn.compute_candidates(mm_engine)

    # Fixed seed so that the same candidates are picked on every run.
    random.seed(0)

    # NOTE(review): shuffle() is applied to the same object that is read
    # through .values() below; that only works for a mapping keyed by
    # 0..n-1 -- confirm against cn.compute_candidates.
    random.shuffle(candidates)

    # Building a log is the expensive part, so only the first five are used.
    for idx, cand in enumerate(list(candidates.values())[:5]):
        log_name = 'log_test_{}'.format(idx)
        print('Computing Log: {}'.format(log_name))
        cn.build_log_for_case_notion(mm_engine,
                                     cand,
                                     proc_name='proc_test_{}'.format(idx),
                                     log_name=log_name)

    assert len(candidates) > 0
コード例 #2
0
ファイル: test_extraction.py プロジェクト: edugonza/eddytools
def check_mm(openslex_file_path, connection_params, metadata):
    """Compare per-table object counts of a meta-model with its source DB.

    For every table name in ``metadata.tables`` the number of ``object`` rows
    whose class has that name (meta-model side) is compared with
    ``count(*)`` of the table itself (database side).

    Args:
        openslex_file_path: path handed to ``ex.create_mm_engine``.
        connection_params: keyword arguments for ``ex.create_db_engine``.
        metadata: object exposing a ``tables`` mapping keyed by table name.

    Returns:
        ``True`` when the counts of every table matched.

    Raises:
        AssertionError: at the first table whose counts differ.
    """
    mm_engine = ex.create_mm_engine(openslex_file_path)
    mm_conn = mm_engine.connect()

    db_engine = ex.create_db_engine(**connection_params)
    db_conn = db_engine.connect()

    check = True

    # The finally clause releases both connections and engines even when the
    # assertion below (or one of the queries) raises.
    try:
        for t in metadata.tables.keys():
            mm_q = sq.select([text('count(O.id) as num')]).select_from(text('object as O, class as CL')) \
                .where(text('O.class_id == CL.id and CL.name == "{}"'.format(t)))
            mm_res = mm_conn.execute(mm_q)
            mm_num = mm_res.first()
            mm_res.close()

            db_q = sq.select([text('count(*) as num')]).select_from(text(t))
            db_res = db_conn.execute(db_q)
            db_num = db_res.first()
            db_res.close()

            check_t = mm_num['num'] == db_num['num']
            check = check and check_t

            assert check, 'Object count mismatch for table {}'.format(t)
    finally:
        mm_conn.close()
        mm_engine.dispose()

        db_conn.close()
        db_engine.dispose()

    print(check)

    return check
コード例 #3
0
def test_default_model(openslex=train_openslex_file_path,
                       ground_truth=ground_truth_path):
    """Score the 'default' event-definition model against a ground truth.

    Runs ``ev.discover_event_definitions`` with ``model='default'`` on the
    meta-model at ``openslex``, loads the expected labels for the candidates
    of each candidate type from ``ground_truth`` and pretty-prints the scores
    of the predictions.
    """
    engine = ex.create_mm_engine(openslex)
    meta = ex.get_mm_meta(engine)
    discovery = ev.discover_event_definitions(engine, meta, model='default')
    discoverer = discovery['aid']

    # Expected labels first, for every candidate type ...
    expected = {}
    for ctype in (ev.CT_IN_TABLE, ev.CT_LOOKUP, ev.CT_TS_FIELD):
        expected[ctype] = discoverer.load_y_true(
            discovery[ctype]['candidates'], ground_truth)

    # ... then the scores of the predictions against those labels.
    scores = {}
    for ctype in (ev.CT_TS_FIELD, ev.CT_IN_TABLE, ev.CT_LOOKUP):
        scores[ctype] = discoverer.score(expected[ctype],
                                         discovery[ctype]['predicted'])

    report = (
        ('Score Ts Fields', ev.CT_TS_FIELD),
        ('Score In Table', ev.CT_IN_TABLE),
        ('Score Lookup', ev.CT_LOOKUP),
    )
    for title, ctype in report:
        print(title)
        pprint(scores[ctype])
コード例 #4
0
ファイル: __main__.py プロジェクト: edugonza/eddytools
def export_log(mm_path, log_id, output_file):
    """Write the log ``log_id`` of the meta-model at ``mm_path`` as CSV.

    The data frame returned by ``cn.log_to_dataframe`` is saved to
    ``output_file`` with its index column labelled ``idx``.
    """
    engine = ex.create_mm_engine(mm_path)
    frame = cn.log_to_dataframe(mm_engine=engine,
                                mm_meta=ex.get_mm_meta(engine),
                                log_id=log_id)
    frame.to_csv(output_file, index_label='idx')
コード例 #5
0
ファイル: __main__.py プロジェクト: edugonza/eddytools
def print_cn(mm_path, cn_id, cn_dir, output_file, view):
    """Render one pickled case notion candidate via ``dump_cn_dot``.

    Args:
        mm_path: path of the meta-model handed to ``ex.create_mm_engine``.
        cn_id: key of the case notion inside the pickled candidates.
        cn_dir: directory containing ``candidates.pkl``.
        output_file: forwarded to ``dump_cn_dot``.
        view: forwarded to ``dump_cn_dot``.

    Raises:
        Exception: when ``candidates.pkl`` cannot be loaded or has no entry
            for ``cn_id``; the underlying error is chained as ``__cause__``.
    """
    mm_engine = ex.create_mm_engine(mm_path)
    mm_meta = ex.get_mm_meta(mm_engine)
    try:
        # NOTE: unpickling can run arbitrary code; only point ``cn_dir`` at
        # directories whose contents are trusted.
        with open(Path(cn_dir, 'candidates.pkl'), 'rb') as candidates_file:
            candidates = pickle.load(candidates_file)
        case_notion = candidates[cn_id]
    except Exception as err:
        raise Exception('No case notion to show') from err
    # Rendering stays outside the try block so that its own failures are not
    # misreported as a missing case notion.
    dump_cn_dot(mm_engine, mm_meta, case_notion, output_file, view)
コード例 #6
0
ファイル: __main__.py プロジェクト: edugonza/eddytools
def print_cn_log(mm_path, log_id, output_file, view):
    """Render the case notion recorded in the attributes of log ``log_id``.

    Raises:
        Exception: when the log attributes hold no ``case_notion`` entry.
    """
    engine = ex.create_mm_engine(mm_path)
    meta = ex.get_mm_meta(engine)
    log_attributes = cn.log_info(engine, meta, log_id)['attributes']
    if 'case_notion' not in log_attributes:
        raise Exception('No case notion to show')
    dump_cn_dot(engine, meta, log_attributes['case_notion'], output_file,
                view)
コード例 #7
0
def test_candidates():
    """Train a model, discover event definitions and build a few events.

    A model is trained on the meta-model at ``train_openslex_file_path`` and
    used to discover event definitions.  The meta-model is then copied to
    ``modified_mm_path`` and, per candidate type, events are computed for the
    first two candidates that were predicted as 1.
    """
    engine = ex.create_mm_engine(train_openslex_file_path)
    meta = ex.get_mm_meta(engine)

    os.makedirs(dumps_dir, exist_ok=True)

    # Passed unchanged as the ``classes`` argument of both calls below.
    classes = None

    model = ev.train_model(mm_engine=engine,
                           mm_meta=meta,
                           y_true_path=ground_truth_path,
                           classes=classes,
                           model_output=trained_model)
    disc = ev.discover_event_definitions(mm_engine=engine,
                                         mm_meta=meta,
                                         classes=classes,
                                         dump_dir=dumps_dir,
                                         model=model)

    # Events are written into a copy, not into the training meta-model.
    shutil.copyfile(train_openslex_file_path, modified_mm_path)
    engine_modif = ex.create_mm_engine(modified_mm_path)
    meta_modif = ex.get_mm_meta(engine_modif)

    def positives(candidate_type):
        """Return the candidates of ``candidate_type`` predicted as 1."""
        found = disc[candidate_type]
        return [
            cand
            for flag, cand in zip(found['predicted'], found['candidates'])
            if flag == 1
        ]

    selections = [
        positives(candidate_type)
        for candidate_type in (ev.CT_TS_FIELD, ev.CT_IN_TABLE, ev.CT_LOOKUP)
    ]
    for chosen in selections:
        ev.compute_events(engine_modif, meta_modif, chosen[:2])
コード例 #8
0
ファイル: __main__.py プロジェクト: edugonza/eddytools
def disc_and_build(mm: Path, new_mm: Path, dump_dir: Path, build_events=False):
    """Discover event definitions for a meta-model, caching each step on disk.

    Every intermediate result (timestamp attributes, candidates, features
    and final candidates) is loaded from ``dump_dir`` when its file exists
    and is computed and saved there otherwise.

    Args:
        mm: path of the meta-model to analyse.
        new_mm: path the meta-model is copied to before events are built.
        dump_dir: directory holding the cached intermediate results.
        build_events: when true, copy ``mm`` to ``new_mm`` and compute the
            events of all final candidates in the copy.
    """
    print("Discovering and building events for: {}".format(mm))
    print("In: {}".format(new_mm))
    print("Dumping in: {}".format(dump_dir))

    mm_engine_train = ex.create_mm_engine(mm)
    mm_meta_train = ex.get_mm_meta(mm_engine_train)

    ts_train_path = dump_dir / 'timestamps.json'
    candidates_ts_fields_path = dump_dir / 'candidates_ts_fields.json'
    candidates_in_table_path = dump_dir / 'candidates_in_table.json'
    candidates_lookup_path = dump_dir / 'candidates_lookup.json'
    features_in_table_path = dump_dir / 'features_in_table.json'
    features_lookup_path = dump_dir / 'features_lookup.json'
    final_candidates_ts_fields_path = dump_dir / 'final_candidates_ts_fields.json'
    final_candidates_in_table_path = dump_dir / 'final_candidates_in_table.json'
    final_candidates_lookup_path = dump_dir / 'final_candidates_lookup.json'

    os.makedirs(dump_dir, exist_ok=True)

    aid = ev.ActivityIdentifierDiscoverer(engine=mm_engine_train,
                                          meta=mm_meta_train,
                                          model='default')

    if os.path.exists(ts_train_path):
        timestamp_attrs = aid.load_timestamp_attributes(ts_train_path)
    else:
        timestamp_attrs = aid.get_timestamp_attributes()
        aid.save_timestamp_attributes(timestamp_attrs, ts_train_path)

    def cached_candidates(path, candidate_type):
        """Load the candidates stored at ``path`` or generate and save them."""
        if os.path.exists(path):
            return aid.load_candidates(path)
        generated = aid.generate_candidates(timestamp_attrs=timestamp_attrs,
                                            candidate_type=candidate_type)
        aid.save_candidates(generated, path)
        return generated

    def cached_features(path, candidates):
        """Load the features stored at ``path`` or compute and save them."""
        if os.path.exists(path):
            return aid.load_features(path)
        computed = aid.compute_features(candidates, verbose=1)
        aid.save_features(computed, path)
        return computed

    def cached_final(path, select):
        """Load the JSON list at ``path`` or store the result of ``select()``."""
        if os.path.exists(path):
            with open(path, 'rt') as cache_file:
                return json.load(cache_file)
        selected = select()
        with open(path, 'wt') as cache_file:
            json.dump(selected, cache_file, indent=True)
        return selected

    def accepted(predicted, candidates):
        """Keep the candidates whose prediction equals 1."""
        return [c for p, c in zip(predicted, candidates) if p == 1]

    candidates_ts_fields = cached_candidates(candidates_ts_fields_path,
                                             ev.CT_TS_FIELD)
    candidates_in_table = cached_candidates(candidates_in_table_path,
                                            ev.CT_IN_TABLE)
    candidates_lookup = cached_candidates(candidates_lookup_path,
                                          ev.CT_LOOKUP)

    X_in_table = cached_features(features_in_table_path, candidates_in_table)
    X_lookup = cached_features(features_lookup_path, candidates_lookup)

    # No model is consulted for timestamp fields: every candidate is kept.
    final_candidates_ts_fields = cached_final(
        final_candidates_ts_fields_path,
        lambda: list(candidates_ts_fields))
    final_candidates_in_table = cached_final(
        final_candidates_in_table_path,
        lambda: accepted(
            aid.predict(X_in_table, candidate_type=ev.CT_IN_TABLE),
            candidates_in_table))
    final_candidates_lookup = cached_final(
        final_candidates_lookup_path,
        lambda: accepted(
            aid.predict(X_lookup, candidate_type=ev.CT_LOOKUP),
            candidates_lookup))

    if build_events:
        # Events are written into a copy so that ``mm`` stays untouched.
        shutil.copyfile(mm, new_mm)

        mm_engine_modif = ex.create_mm_engine(new_mm)
        mm_meta_modif = ex.get_mm_meta(mm_engine_modif)

        for final_candidates in (final_candidates_ts_fields,
                                 final_candidates_in_table,
                                 final_candidates_lookup):
            ev.compute_events(mm_engine_modif, mm_meta_modif,
                              final_candidates)
コード例 #9
0
ファイル: __main__.py プロジェクト: edugonza/eddytools
def info_log(mm_path, log_id):
    """Pretty-print what ``cn.log_info`` reports for log ``log_id``."""
    engine = ex.create_mm_engine(mm_path)
    pprint(cn.log_info(engine, ex.get_mm_meta(engine), log_id))
コード例 #10
0
ファイル: __main__.py プロジェクト: edugonza/eddytools
def list_logs(mm_path):
    """Pretty-print what ``cn.list_logs`` reports for the meta-model."""
    engine = ex.create_mm_engine(mm_path)
    pprint(cn.list_logs(engine, ex.get_mm_meta(engine)))
コード例 #11
0
ファイル: __main__.py プロジェクト: edugonza/eddytools
def case_notion_candidates_cached(mm_path,
                                  dump_dir,
                                  build_logs=False,
                                  topk=None):
    """Rank case notion candidates, caching every step in ``dump_dir``.

    Class statistics, candidates, bounds, predictions, ranking parameters
    and the detailed ranking are each loaded from ``dump_dir`` when their
    file exists and are computed and stored there otherwise.

    Args:
        mm_path: path of the meta-model to analyse.
        dump_dir: directory holding the cached results.
        build_logs: when true, copy the meta-model to
            ``mm-modif-logs.slexmm`` inside ``dump_dir`` and build one log
            per ranked candidate in the copy.
        topk: number of top-ranked candidates to build logs for; any falsy
            value (``None``, ``0``) builds logs for the whole ranking.
    """
    def cached_json(path, compute):
        """Load the JSON document at ``path`` or store ``compute()`` there."""
        if os.path.isfile(path):
            with open(path, 'rt') as cache_file:
                return json.load(cache_file)
        value = compute()
        with open(path, 'wt') as cache_file:
            json.dump(value, cache_file, indent=True)
        return value

    mm_engine = ex.create_mm_engine(mm_path)
    mm_meta = ex.get_mm_meta(mm_engine)

    os.makedirs(dump_dir, exist_ok=True)

    class_stats_path = Path(dump_dir, 'class_stats_mm.json')
    candidates_path = Path(dump_dir, 'candidates.pkl')
    bounds_path = Path(dump_dir, 'bounds.json')
    prediction_path = Path(dump_dir, 'prediction.json')
    params_path = Path(dump_dir, 'params.json')
    ranking_path = Path(dump_dir, 'ranking.json')
    mm_modif_path = Path(dump_dir, 'mm-modif-logs.slexmm')

    class_stats = cached_json(class_stats_path,
                              lambda: cn.get_stats_mm(mm_engine))

    if os.path.isfile(candidates_path):
        candidates_mem = cn.load_candidates(candidates_path)
    else:
        candidates = cn.compute_candidates(mm_engine, cache_dir=dump_dir)
        candidates_mem = cn.save_candidates(candidates, candidates_path)

    bounds = cached_json(
        bounds_path,
        lambda: cn.compute_bounds_of_candidates(candidates_mem, mm_engine,
                                                mm_meta, class_stats))

    pred = cached_json(
        prediction_path,
        lambda: cn.compute_prediction_from_bounds(bounds, 0.5, 0.5, 0.5))

    # Written to params.json on the first run; later runs read that file,
    # so it can be edited to change the ranking.
    default_params = {
        'mode_sp': None,
        'max_sp': None,
        'min_sp': None,
        'mode_lod': None,
        'max_lod': 10,
        'min_lod': 3,
        'mode_ae': None,
        'max_ae': 3000,
        'min_ae': 0,
        'w_sp': 0.33,
        'w_lod': 0.33,
        'w_ae': 0.33,
    }
    params = cached_json(params_path, lambda: default_params)

    pprint(params)

    if os.path.isfile(ranking_path):
        # DataFrame.from_csv was removed in pandas 1.0; this is the
        # replacement the pandas documentation gives for it.
        detailed_ranking = pd.read_csv(ranking_path,
                                       index_col=0,
                                       parse_dates=True)
    else:
        detailed_ranking = cn.compute_detailed_ranking(pred, **params)
        detailed_ranking.to_csv(ranking_path)

    ranking = detailed_ranking['cn_id']
    pprint(detailed_ranking)

    if build_logs:
        # Logs are written into a copy so that ``mm_path`` stays untouched.
        shutil.copyfile(mm_path, mm_modif_path)

        mm_engine_modif = ex.create_mm_engine(mm_modif_path)
        mm_meta_modif = ex.get_mm_meta(mm_engine_modif)

        ranking_to_build = ranking[:topk] if topk else ranking

        for idx in ranking_to_build:
            c = candidates_mem[idx]
            proc_name = 'proc_{}'.format(idx)
            log_name = 'log_{}'.format(idx)
            print('Building Log: {}'.format(log_name))
            cn.build_log_for_case_notion(mm_engine_modif,
                                         c,
                                         proc_name=proc_name,
                                         log_name=log_name,
                                         metadata=mm_meta_modif)
コード例 #12
0
def test_candidates():
    """Build a log per candidate and check its metrics against their bounds.

    Works on a fresh copy of ``openslex_file_path_orig``.  After checking the
    reported relationships, every case notion candidate is round-tripped
    through ``output/dumps/candidates.json``, a log is built for it, and the
    support, level of detail and average number of events of that log are
    asserted to lie between the lower and upper bounds computed for the
    candidate.
    """
    expected_names = (
        'CONCERT_HALL_FK', 'SEAT_HALL_FK', 'BP_CONCERT_FK', 'BP_BAND_FK',
        'TICKET_CONCERT', 'TICKET_SEAT', 'BOOKING_CON', 'BOOKING_FK',
    )

    shutil.copyfile(openslex_file_path_orig, openslex_file_path)

    mm_engine = ex.create_mm_engine(openslex_file_path)
    metadata = ex.get_mm_meta(mm_engine)

    rs = cn.get_relationships(mm_engine)
    assert len(rs) == 8
    for r in rs:
        assert r['rs'] in expected_names

    class_stats = cn.get_stats_mm(mm_engine)

    os.makedirs('output/dumps/', exist_ok=True)

    with open('output/dumps/stats_mm.json', 'wt') as stats_file:
        json.dump(class_stats, stats_file, indent=True)

    candidates = cn.compute_candidates(mm_engine)

    # The file must be closed (and therefore flushed) before it is read
    # back, so both directions use a context manager.
    with open('output/dumps/candidates.json', 'wt') as candidates_file:
        json.dump(candidates, candidates_file, indent=True)

    with open('output/dumps/candidates.json', 'rt') as candidates_file:
        candidates = json.load(candidates_file, object_hook=cn.CaseNotion)

    for idx, cand in enumerate(candidates):
        log_name = 'log_test_{}'.format(idx)
        print('Computing Log: {}'.format(log_name))
        log_id = cn.build_log_for_case_notion(
            mm_engine,
            cand,
            proc_name='proc_test_{}'.format(idx),
            log_name=log_name,
            metadata=metadata)

        # Metrics measured on the log that was actually built.
        sp = cn.compute_support_log(mm_engine, log_id, metadata)
        print('Support for {}: {}'.format(log_id, sp))
        ae = cn.compute_ae_log(mm_engine, log_id, metadata, sp)
        print('AE for {}: {}'.format(log_id, ae))
        lod = cn.compute_lod_log(mm_engine, log_id, metadata, sp)
        print('LoD for {}: {}'.format(log_id, lod))

        # Bounds computed from the candidate and the class statistics.
        sp_lb = cn.compute_lb_support_cn(mm_engine, cand, metadata,
                                         class_stats)
        print('Lower Bound of Support for {}: {}'.format(idx, sp_lb))
        sp_ub = cn.compute_ub_support_cn(mm_engine, cand, metadata,
                                         class_stats)
        print('Upper Bound of Support for {}: {}'.format(idx, sp_ub))

        lod_lb = cn.compute_lb_lod_cn(mm_engine, cand, metadata, class_stats)
        print('Lower Bound of LoD for {}: {}'.format(idx, lod_lb))
        lod_ub = cn.compute_ub_lod_cn(mm_engine, cand, metadata, class_stats)
        print('Upper Bound of LoD for {}: {}'.format(idx, lod_ub))

        ae_lb = cn.compute_lb_ae_cn(mm_engine, cand, metadata, class_stats)
        print('Lower Bound of AE for {}: {}'.format(idx, ae_lb))
        ae_ub = cn.compute_ub_ae_cn(mm_engine, cand, metadata, class_stats)
        print('Upper Bound of AE for {}: {}'.format(idx, ae_ub))

        assert sp_ub >= sp >= sp_lb
        assert lod_ub >= lod >= lod_lb
        assert ae_ub >= ae >= ae_lb

    assert len(candidates) > 0
コード例 #13
0
def evaluate_cn(mm_filepath: str, output_dir):
    """Build a log for every case notion candidate and tabulate its metrics.

    The meta-model is copied to ``mm-modif.slexmm`` inside ``output_dir`` and
    all work is done on that copy.  One row per candidate is collected with
    the metrics of its log, the bounds computed for the candidate and three
    figures derived from the class statistics.  The table is written to
    ``df.json`` and ``df.pickle`` and the class statistics to
    ``class_stats.json``, all inside ``output_dir``.

    Args:
        mm_filepath: path of the meta-model to evaluate.
        output_dir: directory receiving the copy and all output files.
    """
    os.makedirs(output_dir, exist_ok=True)

    mm_filepath_tmp = '{}/mm-modif.slexmm'.format(output_dir)
    df_filepath = '{}/df.json'.format(output_dir)
    df_pickle_file = '{}/df.pickle'.format(output_dir)
    class_stats_filepath = '{}/class_stats.json'.format(output_dir)

    shutil.copyfile(mm_filepath, mm_filepath_tmp)

    mm_engine = edex.create_mm_engine(mm_filepath_tmp)
    mm_meta = edex.get_mm_meta(mm_engine)

    cands = edcn.compute_candidates(mm_engine=mm_engine, max_length_path=5, cache_dir=output_dir)

    # One list per output column; the key order fixes the column order.
    stats = {
        'proc_name': [],
        'log_name': [],
        'log_id': [],
        'log_sp': [],
        'log_lod': [],
        'log_ae': [],
        'cn_sp_lb': [],
        'cn_sp_ub': [],
        'cn_lod_lb': [],
        'cn_lod_ub': [],
        'cn_ae_lb': [],
        'cn_ae_ub': [],
        'num_classes': [],
        'e': [],
        'ir': [],
    }

    class_stats = edcn.get_stats_mm(mm_engine, mm_meta)

    with open(class_stats_filepath, mode='wt') as class_stats_file:
        json.dump(class_stats, class_stats_file, indent=True)

    for i, case_notion in enumerate(tqdm(cands.values(), total=len(cands), desc='Candidates')):
        proc_name = 'proc-{}'.format(i)
        log_name = 'log-{}'.format(i)
        log_id = edcn.build_log_for_case_notion(
            mm_engine, case_notion, proc_name, log_name, mm_meta)
        row = {
            'proc_name': proc_name,
            'log_name': log_name,
            'log_id': log_id,
            'log_sp': edcn.compute_support_log(mm_engine, log_id, mm_meta),
            'log_lod': edcn.compute_lod_log(mm_engine, log_id, mm_meta),
            'log_ae': edcn.compute_ae_log(mm_engine, log_id, mm_meta),
            'cn_sp_lb': edcn.compute_lb_support_cn(mm_engine, case_notion, mm_meta, class_stats),
            'cn_sp_ub': edcn.compute_ub_support_cn(mm_engine, case_notion, mm_meta, class_stats),
            'cn_lod_lb': edcn.compute_lb_lod_cn(mm_engine, case_notion, mm_meta, class_stats),
            'cn_lod_ub': edcn.compute_ub_lod_cn(mm_engine, case_notion, mm_meta, class_stats),
            'cn_ae_lb': edcn.compute_lb_ae_cn(mm_engine, case_notion, mm_meta, class_stats),
            'cn_ae_ub': edcn.compute_ub_ae_cn(mm_engine, case_notion, mm_meta, class_stats),
        }

        # Fetched once per candidate instead of once per use.
        class_ids = case_notion.get_classes_ids()
        num_e = 0
        sum_e_per_o = 0
        for c_id in class_ids:
            c_stats = class_stats[str(c_id)]
            num_e = num_e + c_stats['e']
            # max(..., 1) guards against dividing by a zero 'o_w_ev' count.
            sum_e_per_o += float(c_stats['ev_o']) / float(max(c_stats['o_w_ev'], 1))

        row['num_classes'] = len(class_ids)
        row['e'] = num_e
        row['ir'] = sum_e_per_o / len(class_ids)

        for column, value in row.items():
            stats[column].append(value)

    df = pd.DataFrame(stats)

    df.to_json(df_filepath)
    with open(df_pickle_file, mode='wb') as pickle_file:
        pickle.dump(df, pickle_file)
コード例 #14
0
def test_candidates_cached():
    """Discover event definitions with on-disk caching and build a few events.

    Timestamp attributes, candidates and features are loaded from
    ``output/adw/ev_disc`` when their file exists and are computed and saved
    there otherwise.  The meta-model is then copied to ``modified_mm_path``
    and, per candidate type, events are computed for the first two
    candidates predicted as 1.
    """
    engine = ex.create_mm_engine(train_openslex_file_path)
    meta = ex.get_mm_meta(engine)

    cache_dir = 'output/adw/ev_disc'

    ts_path = '{}/timestamps.json'.format(cache_dir)
    ts_fields_path = '{}/candidates_ts_fields.json'.format(cache_dir)
    in_table_path = '{}/candidates_in_table.json'.format(cache_dir)
    lookup_path = '{}/candidates_lookup.json'.format(cache_dir)
    in_table_features_path = '{}/features_in_table.json'.format(cache_dir)
    lookup_features_path = '{}/features_lookup.json'.format(cache_dir)

    os.makedirs(cache_dir, exist_ok=True)

    aid = ev.ActivityIdentifierDiscoverer(engine=engine,
                                          meta=meta,
                                          model='default')

    if not os.path.exists(ts_path):
        ts_attrs = aid.get_timestamp_attributes()
        aid.save_timestamp_attributes(ts_attrs, ts_path)
    else:
        ts_attrs = aid.load_timestamp_attributes(ts_path)

    def candidates_for(path, candidate_type):
        """Load the candidates stored at ``path`` or generate and save them."""
        if not os.path.exists(path):
            generated = aid.generate_candidates(timestamp_attrs=ts_attrs,
                                                candidate_type=candidate_type)
            aid.save_candidates(generated, path)
            return generated
        return aid.load_candidates(path)

    def features_for(path, cands):
        """Load the features stored at ``path`` or compute and save them."""
        if not os.path.exists(path):
            computed = aid.compute_features(cands, verbose=1)
            aid.save_features(computed, path)
            return computed
        return aid.load_features(path)

    ts_field_cands = candidates_for(ts_fields_path, ev.CT_TS_FIELD)
    in_table_cands = candidates_for(in_table_path, ev.CT_IN_TABLE)
    lookup_cands = candidates_for(lookup_path, ev.CT_LOOKUP)

    in_table_features = features_for(in_table_features_path, in_table_cands)
    lookup_features = features_for(lookup_features_path, lookup_cands)

    # Timestamp-field candidates are all flagged 1 without asking the model.
    ts_field_flags = [1 for _ in ts_field_cands]
    in_table_flags = aid.predict(in_table_features,
                                 candidate_type=ev.CT_IN_TABLE)
    lookup_flags = aid.predict(lookup_features, candidate_type=ev.CT_LOOKUP)

    # Events are written into a copy, not into the training meta-model.
    shutil.copyfile(train_openslex_file_path, modified_mm_path)

    engine_modif = ex.create_mm_engine(modified_mm_path)
    meta_modif = ex.get_mm_meta(engine_modif)

    flagged_pairs = (
        (ts_field_flags, ts_field_cands),
        (in_table_flags, in_table_cands),
        (lookup_flags, lookup_cands),
    )
    selections = [
        [cand for flag, cand in zip(flags, cands) if flag == 1]
        for flags, cands in flagged_pairs
    ]
    for chosen in selections:
        ev.compute_events(engine_modif, meta_modif, chosen[:2])
コード例 #15
0
def test_trained_model_cached(openslex_train=train_openslex_file_path,
                              ground_truth_train=ground_truth_path,
                              openslex_test=train_openslex_file_path,
                              ground_truth_test=ground_truth_path,
                              cached_dir_train='output/A/ev_disc',
                              cached_dir_test='output/B/ev_disc'):
    """Train event-definition classifiers on one meta-model, score on another.

    Timestamp attributes, candidates and features of both meta-models are
    loaded from their cache directory when the file exists and are computed
    and saved there otherwise.  Two XGBoost pipelines (in-table and lookup
    candidates) are trained on the training meta-model, pickled to
    ``model.pkl`` in ``cached_dir_train`` and then scored on the test
    meta-model.  Scores are pretty-printed; nothing is returned.

    Args:
        openslex_train: meta-model used for training.
        ground_truth_train: ground truth for the training candidates.
        openslex_test: meta-model used for evaluation.
        ground_truth_test: ground truth for the test candidates.
        cached_dir_train: cache directory of the training meta-model.
        cached_dir_test: cache directory of the test meta-model.
    """
    # Local import: numpy is only needed for the ``classes`` array below and
    # may not be imported at file level.
    import numpy as np

    def load_inputs(discoverer, cache_dir, verbose):
        """Return ``(candidates, features)`` dicts keyed by candidate type."""

        def cached(file_name, load, build, save):
            """Load ``file_name`` from the cache or build and save it."""
            path = '{}/{}'.format(cache_dir, file_name)
            if os.path.exists(path):
                return load(path)
            value = build()
            save(value, path)
            return value

        timestamp_attrs = cached('timestamps.json',
                                 discoverer.load_timestamp_attributes,
                                 discoverer.get_timestamp_attributes,
                                 discoverer.save_timestamp_attributes)

        # Each cache file is generated with its own candidate type.  The
        # test-side timestamp-field file used to be generated with
        # CT_IN_TABLE; caches written by that version must be deleted.
        candidates = {}
        for candidate_type, file_name in (
                (ev.CT_TS_FIELD, 'candidates_ts_fields.json'),
                (ev.CT_IN_TABLE, 'candidates_in_table.json'),
                (ev.CT_LOOKUP, 'candidates_lookup.json')):
            candidates[candidate_type] = cached(
                file_name,
                discoverer.load_candidates,
                lambda: discoverer.generate_candidates(
                    timestamp_attrs=timestamp_attrs,
                    candidate_type=candidate_type),
                discoverer.save_candidates)

        features = {}
        for candidate_type, file_name in (
                (ev.CT_IN_TABLE, 'features_in_table.json'),
                (ev.CT_LOOKUP, 'features_lookup.json')):
            features[candidate_type] = cached(
                file_name,
                discoverer.load_features,
                lambda: discoverer.compute_features(
                    candidates[candidate_type], verbose=verbose),
                discoverer.save_features)

        return candidates, features

    def balanced_class_weight(y_true):
        """Return balanced weights for classes 0 and 1, or ones on failure."""
        try:
            # Keyword arguments and an ndarray for ``classes`` are required
            # by current scikit-learn releases.
            return compute_class_weight(class_weight='balanced',
                                        classes=np.array([0, 1]),
                                        y=y_true)
        except (ValueError, TypeError):
            # Labels that cannot be weighted fall back to neutral weights.
            return [1.0, 1.0]

    model_trained_path = '{}/model.pkl'.format(cached_dir_train)

    os.makedirs(cached_dir_test, exist_ok=True)
    os.makedirs(cached_dir_train, exist_ok=True)

    # --- training side -------------------------------------------------
    mm_engine_train = ex.create_mm_engine(openslex_train)
    mm_meta_train = ex.get_mm_meta(mm_engine_train)

    aid = ev.ActivityIdentifierDiscoverer(engine=mm_engine_train,
                                          meta=mm_meta_train,
                                          model=None)

    candidates_train, features_train = load_inputs(aid, cached_dir_train,
                                                   verbose=1)
    X_in_table = features_train[ev.CT_IN_TABLE]
    X_lookup = features_train[ev.CT_LOOKUP]

    y_true_train_in_table = aid.load_y_true(candidates_train[ev.CT_IN_TABLE],
                                            y_true_path=ground_truth_train)
    y_true_train_lookup = aid.load_y_true(candidates_train[ev.CT_LOOKUP],
                                          y_true_path=ground_truth_train)

    class_weight_in_table = balanced_class_weight(y_true_train_in_table)
    class_weight_lookup = balanced_class_weight(y_true_train_lookup)

    # Index 1 is the weight of the positive class.
    classifiers = {
        ev.CT_IN_TABLE:
        ev.make_sklearn_pipeline(
            XGBClassifier(max_depth=2,
                          n_estimators=10,
                          random_state=1,
                          scale_pos_weight=class_weight_in_table[1])),
        ev.CT_LOOKUP:
        ev.make_sklearn_pipeline(
            XGBClassifier(max_depth=2,
                          n_estimators=10,
                          random_state=1,
                          scale_pos_weight=class_weight_lookup[1]))
    }

    aid.set_model(classifiers)

    aid.train_model(X_in_table,
                    y_true_train_in_table,
                    candidate_type=ev.CT_IN_TABLE)
    aid.train_model(X_lookup, y_true_train_lookup, candidate_type=ev.CT_LOOKUP)

    y_pred_in_table = aid.predict(X_in_table, candidate_type=ev.CT_IN_TABLE)
    y_pred_lookup = aid.predict(X_lookup, candidate_type=ev.CT_LOOKUP)

    scores_train_in_table = aid.score(y_true_train_in_table, y_pred_in_table)
    scores_train_lookup = aid.score(y_true_train_lookup, y_pred_lookup)

    print('Scores In Table')
    pprint(scores_train_in_table)
    print('Scores Lookup')
    pprint(scores_train_lookup)

    with open(model_trained_path, mode='wb') as f:
        pickle.dump(classifiers, f)

    # --- test side -----------------------------------------------------
    mm_engine_test = ex.create_mm_engine(openslex_test)
    mm_meta_test = ex.get_mm_meta(mm_engine_test)

    aid_test = ev.ActivityIdentifierDiscoverer(engine=mm_engine_test,
                                               meta=mm_meta_test,
                                               model=classifiers)

    candidates_test, features_test = load_inputs(aid_test, cached_dir_test,
                                                 verbose=True)

    # Timestamp-field candidates are all predicted as 1 without a model.
    pred_test_ts_fields = [1 for _ in candidates_test[ev.CT_TS_FIELD]]
    pred_test_in_table = aid_test.predict(features_test[ev.CT_IN_TABLE],
                                          candidate_type=ev.CT_IN_TABLE)
    pred_test_lookup = aid_test.predict(features_test[ev.CT_LOOKUP],
                                        candidate_type=ev.CT_LOOKUP)

    y_true_in_table = aid_test.load_y_true(candidates_test[ev.CT_IN_TABLE],
                                           ground_truth_test)
    y_true_lookup = aid_test.load_y_true(candidates_test[ev.CT_LOOKUP],
                                         ground_truth_test)
    y_true_ts_fields = aid_test.load_y_true(candidates_test[ev.CT_TS_FIELD],
                                            ground_truth_test)
    scores_ts_fields = aid_test.score(y_true_ts_fields, pred_test_ts_fields)
    scores_in_table = aid_test.score(y_true_in_table, pred_test_in_table)
    scores_lookup = aid_test.score(y_true_lookup, pred_test_lookup)
    print('Score Ts Fields')
    pprint(scores_ts_fields)

    print('Score In Table')
    pprint(scores_in_table)

    print('Score Lookup')
    pprint(scores_lookup)