Example #1
    def test_get_encoded_logs_Loaded_cache(self):
        job = create_test_job()

        w_cache = get_encoded_logs(job, True)

        cached_loaded_log = LoadedLog.objects.filter(split=job.split)[0]

        cached_train = cached_loaded_log.train_log_path
        cached_test = cached_loaded_log.test_log_path

        os.remove('cache/loaded_log_cache/' + get_digested(cached_train) +
                  '.pickle')

        loaded_from_cache = get_encoded_logs(job, True)

        assert_frame_equal(w_cache[0], loaded_from_cache[0])
        assert_frame_equal(w_cache[1], loaded_from_cache[1])

        os.remove('cache/loaded_log_cache/' + get_digested(cached_test) +
                  '.pickle')

        loaded_from_cache = get_encoded_logs(job, True)

        assert_frame_equal(w_cache[0], loaded_from_cache[0])
        assert_frame_equal(w_cache[1], loaded_from_cache[1])
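
The test relies on get_encoded_logs rebuilding any cache file it finds missing. A minimal sketch of the cache-path convention the test exercises; the digest function here is a hypothetical stand-in, not necessarily the project's get_digested:

import hashlib

def cache_pickle_path(log_path: str) -> str:
    # hypothetical helper mirroring the path built in the test:
    # 'cache/loaded_log_cache/' + digest + '.pickle'
    digest = hashlib.md5(log_path.encode()).hexdigest()
    return 'cache/loaded_log_cache/' + digest + '.pickle'
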
Example #2
    def test_get_encoded_logs_cache(self):
        job = create_test_job()

        w_cache = get_encoded_logs(job, True)
        wout_cache = get_encoded_logs(job, False)

        assert_frame_equal(w_cache[0], wout_cache[0])
        assert_frame_equal(w_cache[1], wout_cache[1])

        loaded_from_cache = get_encoded_logs(job, True)

        assert_frame_equal(w_cache[0], loaded_from_cache[0])
        assert_frame_equal(w_cache[1], loaded_from_cache[1])
Example #3
    def handle(self, *args, **kwargs):
        # get model
        TARGET_MODEL = 5
        job = Job.objects.filter(pk=TARGET_MODEL)[0]
        model = joblib.load(job.predictive_model.model_path)
        model = model[0]
        training_df, test_df = get_encoded_logs(job)
        feature_names = list(
            training_df.drop(['trace_id', 'label'], 1).columns.values)

        X_train = training_df.drop(['trace_id', 'label'], 1)
        Y_train = training_df.drop(
            ['trace_id', 'prefix_1', 'prefix_3', 'prefix_4', 'label'], 1)

        rf = RuleFit()
        columns = list(X_train.columns)

        X = X_train.values

        rf.fit(X, Y_train.values.ravel(), feature_names=columns)
        rules = rf.get_rules()
        # rules = rules[rules.coef != 0].sort_values("support", ascending=False)
        rules = rules[(rules.coef > 0.) & (rules.type != 'linear')]
        rules['effect'] = rules['coef'] * rules['support']
        pd.set_option('display.max_colwidth', None)
        rules = rules.nlargest(10, 'effect')
        print(rules)
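
For reference, the coef-times-support weighting used above can be checked on a toy rules frame; the numbers below are made up, not from the model:

import pandas as pd

toy_rules = pd.DataFrame({
    'rule': ['prefix_1 > 0 & Age_1 <= 3', 'prefix_2 > 1'],
    'coef': [0.8, 0.3],      # weight of the rule in RuleFit's linear model
    'support': [0.25, 0.6],  # fraction of samples the rule covers
    'type': ['rule', 'rule'],
})
toy_rules['effect'] = toy_rules['coef'] * toy_rules['support']
print(toy_rules.nlargest(10, 'effect'))  # strongest, broadest rules first
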
Example #4
    def handle(self, *args, **kwargs):
        plt.style.use('ggplot')
        plt.figure(figsize=(6, 6))
        TARGET_MODEL = 59
        job = Job.objects.filter(pk=TARGET_MODEL)[0]

        training_df, test_df = get_encoded_logs(job)

        X_train = training_df.drop(['trace_id', 'label'], 1)
        RF = DecisionTreeClassifier()

        Y_train = training_df['label'].values
        RF.fit(X_train, Y_train)

        importancies, _ = audit_model(RF.predict, X_train)
        print(importancies)

        # generate feature dependence plot
        fig = plot_dependencies(
            importancies.median(),
            reverse_values=False,
            title="FairML feature dependence plot"
        )

        file_name = "fairml_plot_train_1_3_decision_tree.png"
        plt.savefig(file_name, transparent=False, bbox_inches='tight', dpi=550)
Example #5
    def handle(self, *args, **kwargs):

        TARGET_MODEL = 68
        job = Job.objects.filter(pk=TARGET_MODEL)[0]
        model = joblib.load(job.predictive_model.model_path)
        model = model[0]
        training_df, test_df = get_encoded_logs(job)

        EXPLANATION_TARGET = 2_3300  # underscore is a digit separator: 23300
        FEATURE_TARGET = 1
        shap.initjs()

        explainer = shap.TreeExplainer(model)
        training_df = training_df.drop(['trace_id', 'label'], 1)

        shap_values = explainer.shap_values(training_df)

        encoder = retrieve_proper_encoder(job)
        encoder.decode(training_df, job.encoding)

        shap.force_plot(explainer.expected_value,
                        shap_values[EXPLANATION_TARGET, :],
                        training_df.iloc[EXPLANATION_TARGET, :],
                        show=False,
                        matplotlib=True).savefig('shap_plot_train_1_3.png')
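
A global view of the same explanation can complement the single-instance force plot; a minimal sketch using shap's standard summary plot, assuming the shap_values computed above form a single array (binary or regression case):

import matplotlib.pyplot as plt

shap.summary_plot(shap_values, training_df, show=False)
plt.savefig('shap_summary_train_1_3.png', bbox_inches='tight')
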
Example #6
def get_decoded_df(request, pk):
    job = Job.objects.filter(pk=pk)[0]
    training_df, test_df = get_encoded_logs(job)
    training_df = training_df.drop(['trace_id'], 1)
    encoder = retrieve_proper_encoder(job)
    encoder.decode(training_df, job.encoding)
    return Response(training_df, status=200)
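
DRF's Response serializes plain lists and dicts; if handing over the DataFrame directly ever fails to render, converting it to records first is a safe variant (a sketch, not the project's code):

def get_decoded_df_as_records(request, pk):
    # hypothetical variant of get_decoded_df returning JSON-friendly records
    job = Job.objects.filter(pk=pk)[0]
    training_df, test_df = get_encoded_logs(job)
    training_df = training_df.drop(['trace_id'], 1)
    encoder = retrieve_proper_encoder(job)
    encoder.decode(training_df, job.encoding)
    return Response(training_df.to_dict(orient='records'), status=200)
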
Example #7
    def test_get_labelled_logs(self):
        job = create_test_job()
        labelled_logs = get_encoded_logs(job)

        cached_labelled_logs = get_labelled_logs(job)

        assert_frame_equal(labelled_logs[0], cached_labelled_logs[0])
        assert_frame_equal(labelled_logs[1], cached_labelled_logs[1])
Example #8
    def handle(self, *args, **kwargs):
        # get model
        TARGET_MODEL = 59
        job = Job.objects.filter(pk=TARGET_MODEL)[0]
        model = joblib.load(job.predictive_model.model_path)[0]
        # load data
        training_df, test_df = get_encoded_logs(job)
        training_df['label'] = training_df['label'].astype(bool).astype(int)
        columns = list(training_df.columns.values)
        features = list(
            training_df.drop(['trace_id', 'label'], 1).columns.values)
        feature = 'Age_1'
        feature_grids, percentile_info = _get_grids(
            feature_values=training_df[feature].values,
            num_grid_points=10,
            grid_type='percentile',
            percentile_range=None,
            grid_range=None)
        custom_grids = []
        indexs = []
        for x in range(int(feature_grids.min()), int(feature_grids.max() - 1)):
            custom_grids.append(x)
        print(features)
        fig, axes, summary_df = info_plots.target_plot(
            df=training_df,
            feature=feature,
            feature_name='feature value',
            cust_grid_points=custom_grids,
            target='label',
            show_percentile=False)
        fig.savefig('ice_plot_train_1_3_CType.png')

        lists = list(training_df[feature].values)
        for x in range(int(feature_grids.min()), int(feature_grids.max() - 1)):
            indexs.append(lists.index(x))
        encoder = retrieve_proper_encoder(job)
        encoder.decode(training_df, job.encoding)
        values = training_df[feature].values
        lst = []
        print(summary_df)
        if job.encoding.value_encoding != ValueEncodings.BOOLEAN.value:
            for x in range(len(indexs) - 1):
                lst.append({
                    'value': values[indexs[x]],
                    'label': summary_df['label'][x],
                    'count': summary_df['count'][x],
                })
        else:
            for x in range(summary_df.shape[0]):
                lst.append({
                    'value': summary_df['display_column'][x],
                    'label': summary_df['label'][x],
                    'count': summary_df['count'][x],
                })
        print(lst)
Example #9
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations
    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """

    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()).
        performance_metric)  #Todo: WHY DO I NEED TO GET HYPEROPT?
                )

    global training_df, test_df, global_job
    global_job = job
    training_df, test_df = get_encoded_logs(job)

    space = _get_space(job)

    max_evaluations = getattr(
        job.hyperparameter_optimizer,
        job.hyperparameter_optimizer.optimization_method.lower()
    ).max_evaluations  # TODO: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()

    algorithm = _choose_algorithm(job)

    try:
        fmin(_calculate_and_evaluate,
             space,
             algo=algorithm.suggest,
             max_evals=max_evaluations,
             trials=trials)
    except ValueError:
        raise ValueError("All jobs failed, cannot find best configuration")
    current_best = {
        'loss': 100,
        'results': {},
        'predictive_model_id': {},
        'model_split': {},
        'config': {}
    }
    for trial in trials:
        a = trial['result']
        if current_best['loss'] > a['loss']:
            current_best = a

    job.predictive_model = PredictiveModel.objects.filter(
        pk=current_best['predictive_model_id'])[0]
    job.save()

    logger.info("End hyperopt job {}, {} . Results {}".format(
        job.type, get_run(job), current_best['results']))
    return current_best['results'], current_best['config'], current_best[
        'model_split']
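
The fmin/Trials pattern above is standard hyperopt; a self-contained miniature of the same search-then-scan-the-trials loop, with a toy objective:

from hyperopt import Trials, fmin, hp, tpe

trials = Trials()
best = fmin(fn=lambda x: (x - 2) ** 2,  # toy objective to minimise
            space=hp.uniform('x', -5, 5),
            algo=tpe.suggest,
            max_evals=20,
            trials=trials)
# every finished trial carries a result dict, scanned like the job loop above
best_loss = min(trial['result']['loss'] for trial in trials)
print(best, best_loss)
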
Example #10
def explanation(exp_id: int, explanation_target: str = None):
    exp = Explanation.objects.filter(pk=exp_id)[0]
    job = exp.job
    # load data
    training_df, test_df = get_encoded_logs(job)

    result = EXPLANATION[exp.type][EXPLAIN](exp, training_df, test_df,
                                            explanation_target)

    return 'False', result
Example #11
def explanation_temporal_stability(exp_id: int,
                                   explanation_target: str = None):
    exp = Explanation.objects.filter(pk=exp_id)[0]
    job = exp.job
    # load data
    training_df, test_df = get_encoded_logs(job)

    result = EXPLANATION[exp.type][TEMPORAL_STABILITY](exp, training_df,
                                                       test_df,
                                                       explanation_target)

    return 'False', result
Example #12
    def handle(self, *args, **kwargs):
        # get model
        TARGET_MODEL = 20
        job = Job.objects.filter(pk=TARGET_MODEL)[0]
        model = joblib.load(job.predictive_model.model_path)
        model = model[0]

        # load data
        training_df, test_df = get_encoded_logs(job)

        # get random point in evaluation set
        EXPLANATION_TARGET = 1

        # get the actual explanation
        job.encoding.features.remove('label')
        explainer = anchor_tabular.AnchorTabularExplainer(
            class_names=[True, False],
            feature_names=job.encoding.features,
            data=training_df.drop(['trace_id', 'label'], 1).T,
            categorical_names={
                job.encoding.features.index(item): list(range(max(training_df[item])))
                for item in job.encoding.features
            }
        )
        explainer.fit(
            training_df.drop(['trace_id', 'label'], 1).values,
            [True, False],
            test_df.drop(['trace_id', 'label'], 1).values,
            [True, False]
        )

        model_fn = model.predict

        # show plot
        idx = 0
        np.random.seed(1)
        print('Prediction: ', explainer.class_names[model_fn(test_df.drop(['trace_id', 'label'], 1).values[idx].reshape(1, -1))[0]])
        exp = explainer.explain_instance(test_df.drop(['trace_id', 'label'], 1).values[idx], model_fn, threshold=0.95)
        print('Anchor: %s' % (' AND '.join(exp.names())))
        print('Precision: %.2f' % exp.precision())
        print('Coverage: %.2f' % exp.coverage())

        fit_anchor = np.where(np.all(test_df.drop(['trace_id', 'label'], 1).values[:, exp.features()] == test_df.drop(['trace_id', 'label'], 1).values[idx][exp.features()], axis=1))[0]
        print('Anchor test coverage: %.2f' % (fit_anchor.shape[0] / float(test_df.drop(['trace_id', 'label'], 1).shape[0])))
        # print('Anchor test precision: %.2f' % (
        #     np.mean(predict_fn(test_df.drop(['trace_id', 'label'], 1)[fit_anchor]) == predict_fn(test_df.drop(['trace_id', 'label'], 1).as_matrix()[idx].reshape(1, -1))))
        #       )
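
        # A working version of the commented-out check above -- a sketch that
        # reuses model_fn, fit_anchor and idx from this snippet:
        X_test = test_df.drop(['trace_id', 'label'], 1).values
        anchor_precision = np.mean(
            model_fn(X_test[fit_anchor]) == model_fn(X_test[idx].reshape(1, -1)))
        print('Anchor test precision: %.2f' % anchor_precision)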

        print('done')
Example #13
    def handle(self, *args, **kwargs):
        # get model
        TARGET_MODEL = 71
        job = Job.objects.filter(pk=TARGET_MODEL)[0]
        model = joblib.load(job.predictive_model.model_path)[0]
        # load data
        training_df, test_df = get_encoded_logs(job)

        features = list(
            training_df.drop(['trace_id', 'label'], 1).columns.values)
        interpreter = Interpretation(training_df, feature_names=features)
        X_train = training_df.drop(['trace_id', 'label'], 1)
        Y_train = training_df['label'].values

        model_inst = InMemoryModel(model.predict,
                                   examples=X_train,
                                   model_type='classifier',
                                   unique_values=[1, 2],
                                   feature_names=features,
                                   target_names=['label'])
        surrogate_explainer = interpreter.tree_surrogate(model_inst, seed=5)

        surrogate_explainer.fit(X_train,
                                Y_train,
                                use_oracle=True,
                                prune='post',
                                scorer_type='default')
        surrogate_explainer.class_names = features

        viz = dtreeviz(surrogate_explainer.estimator_,
                       X_train,
                       Y_train,
                       target_name='label',
                       feature_names=features,
                       orientation="TD",
                       class_names=list(surrogate_explainer.class_names),
                       fancy=True,
                       X=None,
                       label_fontsize=12,
                       ticks_fontsize=8,
                       fontname="Arial")
        viz.save("skater_plot_train_2_2.svg")
Example #14
    def handle(self, *args, **kwargs):

        #get model
        TARGET_MODEL = 5
        job = Job.objects.filter(pk=TARGET_MODEL)[0]
        model = joblib.load(job.predictive_model.model_path)

        #load data
        training_df, test_df = get_encoded_logs(job)

        #get radom point in evaluation set
        EXPLANATION_TARGET = 3
        #get the actual explanation
        explainer = lime.lime_tabular.LimeTabularExplainer(
            training_df.drop(['trace_id', 'label'], 1).values,
            feature_names=list(
                training_df.drop(['trace_id', 'label'], 1).columns.values),
            categorical_features=list(
                range(training_df.drop(['trace_id', 'label'], 1).shape[1])),
            verbose=True,
            mode='classification',
        )
        exp = explainer.explain_instance(
            test_df.drop(['trace_id', 'label'], 1).iloc[
                EXPLANATION_TARGET],  #TODO probably the opposite would be way less computationally intensive
            model[0].predict_proba,
            num_features=5)
        print(exp.as_list())

        #show plot
        #exp.show_in_notebook(show_table=True)
        # exp.as_pyplot_figure().show()
        exp.save_to_file('oi.html')

        print('done')
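
exp.as_list() yields (feature, weight) pairs; a small sketch turning them into a readable report, reusing exp from the snippet above:

for feature, weight in exp.as_list():
    direction = 'towards positive' if weight > 0 else 'towards negative'
    print('%-40s %+.4f  (%s)' % (feature, weight, direction))
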
Example #15
    def handle(self, *args, **kwargs):
        TARGET_JOB = 71
        SPLITID = 12
        job_obj = Job.objects.filter(pk=TARGET_JOB)[0]
        split_obj = Split.objects.filter(pk=SPLITID)[0]

        training_df, test_df = get_encoded_logs(job_obj)

        test_df1 = test_df.copy()
        test_df2 = test_df.copy()
        test_df3 = test_df.copy()

        # todo: retrieve lime explanation

        # RETRIEVE&SAVE TS
        ts_exp_job, _ = Explanation.objects.get_or_create(
            type=ExplanationTypes.TEMPORAL_STABILITY.value,
            split=split_obj,
            predictive_model=job_obj.predictive_model,
            job=job_obj)
        ts = temporal_stability(ts_exp_job,
                                training_df,
                                test_df1,
                                explanation_target=None)

        # RETRIEVE&SAVE LIMETS
        limets_exp_job, _ = Explanation.objects.get_or_create(
            type=ExplanationTypes.LIME.value,
            split=split_obj,
            predictive_model=job_obj.predictive_model,
            job=job_obj)
        lime_ts = lime_temporal_stability(limets_exp_job,
                                          training_df,
                                          test_df2,
                                          explanation_target=None)

        # SAVE GOLD
        gold = test_df3[['trace_id', 'label']]

        # todo: retrieve confusion matrix

        ts = {
            tid: {
                key + '1' if key[-1:] == '_' else key: ts[tid][key]
                for key in ts[tid]
            }
            for tid in ts
        }
        lime_ts = {
            tid: {
                key + '1' if key[-1:] == '_' else key: lime_ts[tid][key]
                for key in lime_ts[tid]
            }
            for tid in lime_ts
        }

        trace_ids = set(gold['trace_id'])
        def gold_label(tid):
            # gold-standard label of a trace, normalised to 'true'/'false'
            return 'true' if gold[
                gold['trace_id'] == tid]['label'].values[0] else 'false'

        def last_prediction(tid):
            # prediction at the longest available prefix of a trace
            return ts[str(tid)]['prefix_' + str(len(ts[str(tid)]))]['predicted']

        confusion_matrix = {
            'tp': [str(tid) for tid in trace_ids if str(tid) in ts
                   and last_prediction(tid) == 'true'
                   and last_prediction(tid) == gold_label(tid)],
            'tn': [str(tid) for tid in trace_ids if str(tid) in ts
                   and last_prediction(tid) == 'false'
                   and last_prediction(tid) == gold_label(tid)],
            'fp': [str(tid) for tid in trace_ids if str(tid) in ts
                   and last_prediction(tid) == 'true'
                   and last_prediction(tid) != gold_label(tid)],
            'fn': [str(tid) for tid in trace_ids if str(tid) in ts
                   and last_prediction(tid) == 'false'
                   and last_prediction(tid) != gold_label(tid)],
        }

        last_prefix = 'prefix_' + str(job_obj.encoding.prefix_length)
        limefeats = {
            k: {
                key: sorted(
                    [(pref,
                      lime_ts[key][last_prefix][pref]['value'],
                      lime_ts[key][last_prefix][pref]['importance'])
                     for pref in lime_ts[key][last_prefix]],
                    key=lambda x: (x[2], x[1]),
                    # descending lime values when the prediction is positive
                    reverse=k in ['tp', 'fp'])
                for key in confusion_matrix[k] if last_prefix in lime_ts[key]
            }
            for k in confusion_matrix
        }

        freq_seqs = {'tp': {}, 'tn': {}, 'fp': {}, 'fn': {}}

        # todo: retrieve patterns
        CONFUSION_MATRIX = ['tp', 'tn', 'fp', 'fn']

        LIMEFEATS = {
            'abs_lime': False,
            'tp': 0.2,
            'tn': 0.2,
            'fp': 0.2,
            'fn': 0.2,
            'top': 10,
            'outputfile': None
        }
        FREQ_SEQS = {
            'tp': 10,
            'tn': 10,
            'fp': 10,
            'fn': 10,
            'top': 15,
            'outputfile': None,
            'RECOMPUTEDoutputfile': None,
        }
        ABSENCE = {
            'tp': 0.1,
            'tn': 0.1,
            'fp': 0.1,
            'fn': 0.1,
            'ABSENCEoutputfile': None
        }

        MINING_METHOD = 'item_mining'

        print(
            'Initial CONFUSION MATRIX:\n', *[
                '\tlimefeats ' + KEY + ':' + str(len(limefeats[KEY]))
                for KEY in CONFUSION_MATRIX
            ], '\n', *[
                '\tfreq_seqs ' + KEY + ':' + str(len(freq_seqs[KEY]))
                for KEY in CONFUSION_MATRIX
            ])

        available_values = {}
        for KEY in CONFUSION_MATRIX:
            available_values[KEY] = {}
            for tid in limefeats[KEY]:
                for event in limefeats[KEY][tid]:
                    if event[0].split('_')[0] not in available_values[KEY]:
                        available_values[KEY][event[0].split('_')[0]] = set()
                    available_values[KEY][event[0].split('_')[0]].add(event[1])

        filtered_limefeats = {
            KEY: {
                tid: [
                    event for event in limefeats[KEY][tid]
                    if ((not LIMEFEATS['abs_lime']) and (
                        (KEY in ['tp', 'fp'] and event[2] >= LIMEFEATS[KEY]) or
                        (KEY in ['tn', 'fn'] and event[2] <= -LIMEFEATS[KEY])))
                    or
                    (LIMEFEATS['abs_lime'] and abs(event[2]) >= LIMEFEATS[KEY])
                ]
                for tid in limefeats[KEY]
            }
            for KEY in CONFUSION_MATRIX
        }

        prefiltered_limefeats = {
            KEY: {
                tid: [
                    event for event in limefeats[KEY][tid]
                    if ((not LIMEFEATS['abs_lime']) and (
                        (KEY in ['tp', 'fp'] and event[2] >= LIMEFEATS[KEY]) or
                        (KEY in ['tn', 'fn'] and event[2] <= -LIMEFEATS[KEY])))
                    or
                    (LIMEFEATS['abs_lime'] and abs(event[2]) >= LIMEFEATS[KEY])
                ]
                for tid in limefeats[KEY]
            }
            for KEY in CONFUSION_MATRIX
        }

        filtered_limefeats_mine = {
            KEY: {
                tid: prefiltered_limefeats[KEY][tid][0:LIMEFEATS['top']]
                for tid in prefiltered_limefeats[KEY]
            }
            for KEY in CONFUSION_MATRIX
        }

        for KEY in CONFUSION_MATRIX:
            for k in list(filtered_limefeats[KEY]):
                if len(filtered_limefeats[KEY][k]) == 0:
                    del filtered_limefeats[KEY][k]

        def tassellate_numbers(element):
            element = str(element)
            if '.' in element and len(element) <= 5:
                return element.split('.')[0][0] + '0'
            if '.' in element and len(element) >= 10:
                return element.split('.')[0][0:4]
            return element
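
        # Worked examples, hand-checkable against the branches above:
        #   tassellate_numbers('12.34')      -> '10'      (short decimal: first digit + '0')
        #   tassellate_numbers('123456.789') -> '1234'    (long decimal: first four digits)
        #   tassellate_numbers('12.345')     -> '12.345'  (mid-length decimals pass through)
        #   tassellate_numbers(7)            -> '7'       (no decimal point: stringified as-is)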

        def retrieve_right_len(element, available_values):
            if '_' in element:
                return len(available_values[element.split('_')[0]])
            else:
                retval = []
                for attribute in available_values:
                    if any([
                            str(element) == str(tassellate_numbers(value))
                            for value in available_values[attribute]
                    ]):
                        retval += [len(available_values[attribute])]
                return max(retval)

        def weight_freq_seqs(KEY, available_values, element, limefeats):
            print(element[0])
            print(
                'frequency:', element[1], ' * ', 'len w/out absences: ',
                len([el for el in element[0] if 'absence' not in el]), ' * ',
                'sum of enumerator of possible values: ',
                sum([
                    retrieve_right_len(el, available_values[KEY])
                    for el in element[0] if 'absence' not in el
                ]), ' / ',
                'amount of examples in the field of confusion matrix: ',
                len(limefeats[KEY]), ' = ',
                (element[1] *
                 len([el for el in element[0] if 'absence' not in el]) * sum([
                     retrieve_right_len(el, available_values[KEY])
                     for el in element[0] if 'absence' not in el
                 ])) / len(limefeats[KEY]))
            return (
                element[1]  # *
                # len([el for el in element[0] if 'absence' not in el]) *
                # sum([retrieve_right_len(el, available_values[KEY]) for el in element[0] if 'absence' not in el])
            ) / len(limefeats[KEY])

        filtered_freq_seqs_old = {
            KEY: sorted([
                element for element in freq_seqs[KEY]
                if weight_freq_seqs(KEY, available_values, element, limefeats)
                >= FREQ_SEQS[KEY]
            ],
                        key=lambda x: x[1],
                        reverse=True)
            for KEY in CONFUSION_MATRIX
        }

        prefiltered_freq_seqs = {
            KEY: sorted([
                element for element in freq_seqs[KEY]
                if weight_freq_seqs(KEY, available_values, element, limefeats)
                >= FREQ_SEQS[KEY]
            ],
                        key=lambda x: x[1],
                        reverse=True)
            for KEY in CONFUSION_MATRIX
        }

        #todo: is this the actual topK?
        filtered_freq_seqs = {
            KEY: prefiltered_freq_seqs[KEY][0:FREQ_SEQS['top']]
            for KEY in CONFUSION_MATRIX
        }

        print(
            'CONFUSION MATRIX after filtering:\n', *[
                '\tlimefeats ' + KEY + ':' + str(len(filtered_limefeats[KEY]))
                for KEY in CONFUSION_MATRIX
            ], '\n', *[
                '\tfreq_seqs ' + KEY + ':' + str(len(filtered_freq_seqs[KEY]))
                for KEY in CONFUSION_MATRIX
            ])

        def printout_freq_seqs(output_obj, output_file, maxlinelength=5000):
            with open(output_file, 'w+') as f:
                f.write(prettyjson(output_obj, maxlinelength=maxlinelength))

        if (LIMEFEATS['outputfile'] is not None
                or FREQ_SEQS['outputfile'] is not None):
            print('Start saving results..')
            if (LIMEFEATS['outputfile'] is not None):
                printout_freq_seqs(filtered_limefeats,
                                   LIMEFEATS['outputfile'],
                                   maxlinelength=5000)
            if (FREQ_SEQS['outputfile'] is not None):
                printout_freq_seqs(filtered_freq_seqs,
                                   FREQ_SEQS['outputfile'],
                                   maxlinelength=200)
            print('Results saved.')
        else:
            print('FILTERED_LIMEFEATS:\n', filtered_limefeats)
            print('FILTERED_FREQ_SEQS:\n', filtered_freq_seqs)

        print('Computing absence...')

        attributes = {}
        for KEY in CONFUSION_MATRIX:
            for tid in limefeats[KEY]:
                for event in limefeats[KEY][tid]:
                    attribute_name = event[0]
                    if attribute_name not in attributes:
                        attributes[attribute_name] = set()
                    attributes[attribute_name].add(event[1])

        attributes_occurrences = {
            'tp': collections.Counter(),
            'fp': collections.Counter(),
            'tn': collections.Counter(),
            'fn': collections.Counter()
        }

        for KEY in CONFUSION_MATRIX:
            found_stuff = []
            for tid in limefeats[KEY]:
                for event in limefeats[KEY][tid]:
                    found_stuff += [tassellate_numbers(event[1])]

            attributes_occurrences[KEY].update(found_stuff)

        characterised_attributes_occurrences = {}
        for KEY in CONFUSION_MATRIX:
            characterised_attributes_occurrences[KEY] = {}
            for attribute in attributes:
                if attribute not in characterised_attributes_occurrences[KEY]:
                    characterised_attributes_occurrences[KEY][
                        attribute] = dict()
                for attr in attributes[attribute]:
                    characterised_attributes_occurrences[KEY][attribute][
                        tassellate_numbers(attr)] = 0
        for KEY in CONFUSION_MATRIX:
            for occ in attributes_occurrences[KEY]:
                for attr in characterised_attributes_occurrences[KEY]:
                    if occ in characterised_attributes_occurrences[KEY][attr]:
                        characterised_attributes_occurrences[KEY][attr][
                            occ] = attributes_occurrences[KEY][occ]
            for attr in characterised_attributes_occurrences[KEY]:
                characterised_attributes_occurrences[KEY][attr]['Total'] = sum(
                    [
                        characterised_attributes_occurrences[KEY][attr]
                        [element] for element in
                        characterised_attributes_occurrences[KEY][attr]
                    ])
        print('Absence computed.')
        print('The absence AFTER filtering is:\n',
              characterised_attributes_occurrences)

        print(
            'RE-computing the sequence pattern result after applying the thresholds...'
        )

        static_attr = [
            #    'Age',
            #    'ClaimValue',
            #    'CType',
            #    'ClType',
            #    'PClaims',
        ]
        limefeats_static_dinamic = {}
        for KEY in CONFUSION_MATRIX:
            limefeats_static_dinamic[KEY] = {}
            for tid in filtered_limefeats[KEY]:
                limefeats_static_dinamic[KEY][tid] = {
                    'static': [],
                    'dynamic': [
                        att for att in filtered_limefeats[KEY][tid]
                        if not any([
                            att[0].startswith(static_att)
                            for static_att in static_attr
                        ])
                    ]
                }
                current_static_attributes = [
                    att for att in filtered_limefeats[KEY][tid] if any([
                        att[0].startswith(static_att)
                        for static_att in static_attr
                    ])
                ]
                for s_attr in static_attr:
                    curr_attributes = [
                        att for att in current_static_attributes
                        if att[0].startswith(s_attr)
                    ]
                    if len(curr_attributes) > 0:
                        if KEY in ['tp', 'fp']:
                            limefeats_static_dinamic[KEY][tid]['static'] += [
                                max(curr_attributes, key=lambda x: x[2])
                            ]
                        elif KEY in ['tn', 'fn']:
                            limefeats_static_dinamic[KEY][tid]['static'] += [
                                max(curr_attributes, key=lambda x: x[2])
                            ]
                        else:
                            print('Something bad happened')

        dynamic_data = {
            KEY: {
                tid: [
                    # (element[0].split('_')[0] + '_' + element[1])
                    (element[0] + '_' + element[1])
                    for element in sorted(
                        limefeats_static_dinamic[KEY][tid]['dynamic'],
                        # key=lambda x: (x[0].split('_')[1], x[0].split('_')[0])
                        key=lambda x: x[0])
                ]
                for tid in limefeats_static_dinamic[KEY]
                if len(limefeats_static_dinamic[KEY][tid]['dynamic']) > 0
            }
            for KEY in CONFUSION_MATRIX
        }

        static_data = {
            KEY: {
                tid: [
                    (element[0].split('_')[0] + '_' +
                     tassellate_numbers(element[1]))
                    # (element[0] + '_' + tassellate_numbers(element[1]))
                    for element in sorted(
                        limefeats_static_dinamic[KEY][tid]['static'],
                        key=lambda x: (x[0].split('_')[1], x[0].split('_')[0]))
                ]
                for tid in limefeats_static_dinamic[KEY]
                if len(limefeats_static_dinamic[KEY][tid]['static']) > 0
            }
            for KEY in CONFUSION_MATRIX
        }

        data = {}
        for KEY in CONFUSION_MATRIX:
            data[KEY] = {}
            for tid in limefeats[KEY]:
                if tid in static_data[KEY] and tid in dynamic_data[KEY]:
                    data[KEY][
                        tid] = static_data[KEY][tid] + dynamic_data[KEY][tid]
                elif tid in static_data[KEY]:
                    data[KEY][tid] = static_data[KEY][tid]
                elif tid in dynamic_data[KEY]:
                    data[KEY][tid] = dynamic_data[KEY][tid]

        if (MINING_METHOD == 'seq_mining'):
            freq_seqs_after_filter = {
                'tp':
                sorted(
                    seqmining.freq_seq_enum(
                        [data['tp'][tid] for tid in data['tp']], 2)),
                'tn':
                sorted(
                    seqmining.freq_seq_enum(
                        [data['tn'][tid] for tid in data['tn']], 2)),
                'fp':
                sorted(
                    seqmining.freq_seq_enum(
                        [data['fp'][tid] for tid in data['fp']], 2)),
                'fn':
                sorted(
                    seqmining.freq_seq_enum(
                        [data['fn'][tid] for tid in data['fn']], 2)),
            }
        if (MINING_METHOD == 'item_mining'):
            freq_seqs_after_filter = {
                'tp':
                itemmining.relim(itemmining.get_relim_input(
                    [data['tp'][tid] for tid in data['tp']]),
                                 min_support=2),
                'tn':
                itemmining.relim(itemmining.get_relim_input(
                    [data['tn'][tid] for tid in data['tn']]),
                                 min_support=2),
                'fp':
                itemmining.relim(itemmining.get_relim_input(
                    [data['fp'][tid] for tid in data['fp']]),
                                 min_support=2),
                'fn':
                itemmining.relim(itemmining.get_relim_input(
                    [data['fn'][tid] for tid in data['fn']]),
                                 min_support=2),
            }

            freq_seqs_after_filter = {
                KEY: [(tuple(element), freq_seqs_after_filter[KEY][element])
                      for element in freq_seqs_after_filter[KEY]]
                for KEY in CONFUSION_MATRIX
            }

        filtered_freq_seqs_after_filter_old = {
            KEY: sorted([[
                element[0],
                weight_freq_seqs(KEY, available_values, element, limefeats)
            ] for element in freq_seqs_after_filter[KEY] if weight_freq_seqs(
                KEY, available_values, element, limefeats) >= FREQ_SEQS[KEY]],
                        key=lambda x: x[1],
                        reverse=True)
            for KEY in CONFUSION_MATRIX
        }

        # todo: filter topK
        filtered_freq_seqs_after_filter = {
            KEY: filtered_freq_seqs_after_filter_old[KEY][0:FREQ_SEQS['top']]
            for KEY in CONFUSION_MATRIX
        }

        print('Sequence pattern recomputed successfully.')

        if (FREQ_SEQS['RECOMPUTEDoutputfile'] is not None):
            print('Start saving results..')
            printout_freq_seqs(filtered_freq_seqs_after_filter,
                               FREQ_SEQS['RECOMPUTEDoutputfile'],
                               maxlinelength=200)
            print('Results saved.')
        else:
            print('RECOMPUTED_FREQ_SEQS:\n', filtered_freq_seqs_after_filter)

        print('Done, cheers!')
        return confusion_matrix, data, freq_seqs_after_filter, filtered_freq_seqs_after_filter
Example #16
def calculate_hyperopt(job: Job) -> (dict, dict, dict):
    """main entry method for hyperopt calculations
    returns the predictive_model for the best trial

    :param job: job configuration
    :return: tuple containing the results, config and predictive_model split from the search
    """

    logger.info("Start hyperopt job {} with {}, performance_metric {}".format(
        job.type, get_run(job),
        job.hyperparameter_optimizer.__getattribute__(
            job.hyperparameter_optimizer.optimization_method.lower()
        ).performance_metric) #Todo: WHY DO I NEED TO GET HYPEROPT?
    )

    global training_df, test_df, global_job
    global_job = job
    training_df, test_df = get_encoded_logs(job)
    #TODO evaluate on validation set
    if holdout:
        validation_df = test_df
        # test_df = training_df.sample(frac=.2)
        test_df = training_df.tail(int(len(training_df) * 20 / 100))
        training_df = training_df.drop(test_df.index)

    train_start_time = time.time()

    space = _get_space(job)

    max_evaluations = getattr(
        job.hyperparameter_optimizer,
        job.hyperparameter_optimizer.optimization_method.lower()
    ).max_evaluations  # TODO: WHY DO I NEED TO GET HYPEROPT?
    trials = Trials()

    algorithm = _choose_algorithm(job)

    try:
        fmin(_calculate_and_evaluate, space, algo=algorithm.suggest, max_evals=max_evaluations, trials=trials)
    except ValueError:
        raise ValueError("All jobs failed, cannot find best configuration")
    current_best = {'loss': 100, 'results': {}, 'predictive_model_id': {}, 'model_split': {}, 'config': {}}
    for trial in trials:
        a = trial['result']
        if current_best['loss'] > a['loss']:
            current_best = a

    job.predictive_model = PredictiveModel.objects.filter(pk=current_best['predictive_model_id'])[0]
    job.predictive_model.save()
    job.save()

    current_best['results']['elapsed_time'] = timedelta(seconds=time.time() - train_start_time)  # todo find better place for this
    job.evaluation.elapsed_time = current_best['results']['elapsed_time']
    job.evaluation.save()

    #TODO evaluate on validation set
    if holdout:
        results_df, auc = _test(
            current_best['model_split'],
            validation_df.drop(['trace_id'], 1),
            evaluation=True,
            is_binary_classifier=_check_is_binary_classifier(job.labelling.type)
        )
        results = _prepare_results(results_df, auc)
        results['elapsed_time'] = job.evaluation.elapsed_time
        job.evaluation = Evaluation.init(
            job.predictive_model.predictive_model,
            results,
            len(set(test_df['label'])) <= 2
        )
        job.evaluation.save()
        job.save()

    if holdout:
        logger.info("End hyperopt job {}, {}. \n\tResults on test {}. \n\tResults on validation {}.".format(job.type, get_run(job), current_best['results'], results))
        return results, current_best['config'], current_best['model_split']
    else:
        logger.info("End hyperopt job {}, {}. \n\tResults on test {}.".format(job.type, get_run(job), current_best['results']))
        return current_best['results'], current_best['config'], current_best['model_split']
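
Note the inner holdout above takes the last 20% of training_df via tail(), while the commented-out sample(frac=.2) would take a random 20%; with time-ordered logs the difference matters. A toy illustration:

import pandas as pd

df = pd.DataFrame({'v': range(10)})
print(df.tail(int(len(df) * 20 / 100))['v'].tolist())    # always [8, 9]: the last rows
print(df.sample(frac=.2, random_state=0)['v'].tolist())  # a random 20% of the rows
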
Example #17
    def handle(self, *args, **kwargs):
        TARGET_JOB = 439
        initial_job_obj = Job.objects.filter(pk=TARGET_JOB)[0]

        # todo: return performances
        print('Initial Job:',
              initial_job_obj.evaluation.classificationmetrics)  # TODO future bug

        training_df_old, test_df_old = get_encoded_logs(initial_job_obj)
        training_df = training_df_old.copy()
        test_df = test_df_old.copy()

        # todo: what should I randomise?
        TARGETS = [
            [('prefix_1', 2)],  # <- simple pattern
            [('prefix_2', 3)],  # <- simple pattern
            [
                ('prefix_3', 2),
                ('prefix_4', 3),
            ]  # <- complex pattern
        ]
        for target in TARGETS:
            if len(target) == 1:
                target = target[0]
                for df in [training_df, test_df]:
                    m_col = df[target[0]]
                    del df[target[0]]
                    target_values1 = list(set(m_col.values))
                    df[target[0]] = m_col.apply(
                        lambda x: x if x != target[1] else random.choice(target_values1))
            elif len(target) > 1:
                for df in [training_df, test_df]:
                    m_col = df[[column for column, _ in target]]
                    possible_values = {}
                    for column, _ in target:
                        possible_values[column] = list(set(df[column]))
                        del df[column]
                    df[[column for column, _ in target]] = m_col.apply(
                        lambda x: x if any(x[column] != value
                                           for column, value in target)
                        else Series({
                            column: random.choice(possible_values[column])
                            for column, value in target
                        }),
                        axis=1)
            else:
                raise Exception('target list with unexpected value')

        assert not training_df.equals(training_df_old)
        assert not test_df.equals(test_df_old)
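
        # What the asserts guard: e.g. with a toy column Series([2, 1, 2, 3])
        # and target value 2,
        #   m_col.apply(lambda x: x if x != 2 else random.choice(target_values1))
        # leaves 1 and 3 untouched and re-draws every 2 at random, so the
        # randomised frames differ from the originals with high probability.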

        # todo: save new dataset in memory and create split to use it
        initial_split_obj = initial_job_obj.split
        new_split = duplicate_orm_row(initial_split_obj)
        train_log = duplicate_orm_row(new_split.train_log)
        test_log = duplicate_orm_row(new_split.test_log)

        # TODO future bug creates shadows
        train_log.name = 'RETRAIN' + train_log.name
        train_log.path = 'cache/log_cache/' + train_log.name
        train_log.properties = {}
        test_log.name = 'RETRAIN' + test_log.name
        test_log.path = 'cache/log_cache/' + test_log.name
        test_log.properties = {}

        new_split.train_log = train_log
        new_split.test_log = test_log
        new_split.additional_columns = None
        new_split.save()

        prediction_job = create_prediction_job(
            initial_job_obj, initial_job_obj.encoding.prefix_length)
        prediction_job.split = new_split
        prediction_job.split.save()
        prediction_job.save()

        put_labelled_logs(prediction_job, training_df, test_df)

        # todo: build model
        prediction_task(prediction_job.id, do_publish_result=False)
        prediction_job.refresh_from_db()

        # todo: return performances
        print('Retrain Job:', prediction_job.evaluation.classificationmetrics)

        print('Done, cheers!')