Ejemplo n.º 1
0
def _write_probability_rows(fn_out, model, model_end_time, x, time, case_id):
    """Write one ``id;time;p_1;...;p_k`` line per eligible data point.

    A data point is eligible when its time is not earlier than
    ``model_end_time``. The probabilities follow the order of ``all_labels``
    (a name that is not defined in this block and is resolved at module
    level); a label that is not among the model's classes gets 0.

    :param fn_out: path of the file to (over)write
    :param model: fitted classifier exposing ``classes_`` and ``predict_proba``
    :param model_end_time: data points with a smaller time are skipped
    :param x: feature rows, each supporting ``reshape(1, -1)``
    :param time: time value per feature row
    :param case_id: identifier per feature row
    """
    model_labels = model.classes_.tolist()
    # The column of each label in the predict_proba output does not depend on
    # the data point, so it is resolved once instead of once per row.
    label_columns = [
        model_labels.index(label) if label in model_labels else None
        for label in all_labels
    ]

    with open(fn_out, 'w+') as wf:
        for dx, t, idn in zip(x, time, case_id):
            if t < model_end_time:
                # Only test if the model existed before the data point
                continue
            model_predictions = model.predict_proba(dx.reshape(1, -1))[0]
            actual_predictions = [
                0 if column is None else model_predictions[column]
                for column in label_columns
            ]
            # NOTE(review): '{:4f}' means "minimum width 4, default precision";
            # '{:.4f}' may have been intended. Left unchanged so the file
            # format stays the same - confirm before changing.
            wf.write('{};{};{}\n'.format(
                idn, t,
                ';'.join(['{:4f}'.format(p) for p in actual_predictions])))


def parse_ms(s):
    """Write class-probability files for all S_J models and the naive model.

    For every J value reported by ``Name_functions.S_J_values(s)`` (handled
    in descending numeric order) whose probability file does not exist yet,
    the matching model is loaded and its probabilities are written. The same
    is then done for the naive model, unless its file already exists.

    :param s: value forwarded to the ``Name_functions`` helpers
    """
    fn_data = Name_functions.DS_file(s)
    x, _, time, case_id = Di(fn_data).get_data(return_identifiers=True,
                                               return_split_values=True)

    print('\tM^{}_j ... '.format(s), end='', flush=True)

    # S predictions
    for j in sorted([int(j) for j in Name_functions.S_J_values(s)],
                    reverse=True):
        if Filefunctions.exists(Name_functions.DSJ_probabilities(s, j)):
            continue

        model_j = Model_Functions.loadModel(Name_functions.model_SJ(s, j))
        _write_probability_rows(
            fn_out=Name_functions.DSJ_probabilities(s, j),
            model=model_j,
            model_end_time=Name_functions.SJ_period_end_time(s, j),
            x=x, time=time, case_id=case_id)
    print('Done')

    # Naive predictions
    print('\tM^{}_naive ... '.format(s), end='', flush=True)
    if Filefunctions.exists(Name_functions.DS_probabilities_naive(s)):
        print('Already done')
        return

    model_naive = Model_Functions.loadModel(Name_functions.model_S_naive(s))
    _write_probability_rows(
        fn_out=Name_functions.DS_probabilities_naive(s),
        model=model_naive,
        model_end_time=Parameters.train_time_naive_stop,
        x=x, time=time, case_id=case_id)
    print('Done')
Ejemplo n.º 2
0
def create_labelled_dataset(event_log, s, feature_filename, output_file):
    """Copy a feature file, appending a case-start and a class column.

    The first four lines of ``feature_filename`` are treated as header lines
    and each gets a fixed suffix. Every following line gets the start of the
    case named in its first column and the label computed for that case.

    :param event_log: an ``EventLog`` or the filename to build one from
    :param s: int forwarded to ``event_log.get_splits``
    :param feature_filename: ';'-separated input file
    :param output_file: path of the labelled file to write
    :raises Exception: if ``event_log`` is neither an ``EventLog`` nor a str
    """
    # Check or create Event Log
    assert (isinstance(s, int))
    if isinstance(event_log, EventLog):
        pass
    elif isinstance(event_log, str):
        event_log = EventLog(filename=event_log)
    else:
        raise Exception('Given argument should be filename or EventLog')

    # Dictionary with all labels
    labels = dict()

    # Each split of the full dataset gets parsed separately
    for split in event_log.get_splits(s).values():
        labels.update(algorithm(split))

    header_suffixes = (';Case_start;Class', ';-;-', ';-;-', ';SPLIT;Y')

    # Create labelled data
    with open(feature_filename, 'r') as rf:
        Filefunctions.makeParentDir(output_file)

        with open(output_file, 'w+') as wf:
            for suffix in header_suffixes:
                # rstrip('\n') instead of [:-1]: a line without a trailing
                # newline must not lose its last character.
                wf.write(rf.readline().rstrip('\n') + suffix + '\n')

            # Stream the remaining lines instead of loading them all at once
            for line in rf:
                values = line.rstrip('\n').split(';')
                case = event_log.get_case(values[0])
                values.append(str(case.get_start()))
                values.append(labels[values[0]])
                wf.write(';'.join(values) + '\n')
Ejemplo n.º 3
0
def parse_ms(s):
    """Split the cases after ``Parameters.test_time_start`` into train/test ids.

    Nothing is done when both id files already exist. Otherwise the cases
    are split either chronologically (the earliest fraction becomes the
    training set) or randomly per label with a fixed seed, depending on
    ``Parameters.take_test_split_chronological``, and the ids are written
    one per line to the train and test id files.

    :param s: value forwarded to the ``Name_functions`` helpers
    """
    print('D^{} ... '.format(s), end='', flush=True)
    if (Filefunctions.exists(Name_functions.DS_train_ids(s))
            and Filefunctions.exists(Name_functions.DS_test_ids(s))):
        print('Already done')
        return

    # Fixed seed so the random split below is reproducible
    np.random.seed(0)
    X, y, times, ids = DI(Name_functions.DS_file(s)).get_data(
        Name_functions.DS_reduced_ids_DSJ(s), True, True)

    if Parameters.take_test_split_chronological:
        test_case_ids = []
        train_case_ids = []
        times_post_warm_up = sorted(
            t for t in times if t > Parameters.test_time_start)
        train_end_index = int(
            (1 - Parameters.assessment_test_split) * len(times_post_warm_up))
        if train_end_index < len(times_post_warm_up):
            train_time_end = times_post_warm_up[train_end_index]
        else:
            # No case after the warm-up, or a test fraction of 0: indexing
            # would raise IndexError, so every remaining case is training.
            train_time_end = float('inf')

        for case_start_time, case_id in zip(times, ids):
            if case_start_time <= Parameters.test_time_start:
                continue

            if case_start_time < train_time_end:
                train_case_ids.append(case_id)
            else:
                test_case_ids.append(case_id)
    else:
        indices = [
            i for i, t in enumerate(times) if t > Parameters.test_time_start
        ]
        test_indices = []
        train_indices = []
        c, cc = np.unique(y[indices], return_counts=True)
        for label, label_count in zip(c, cc):
            num_test = int(label_count * Parameters.assessment_test_split)
            indices_c = [i for i in indices if y[i] == label]
            indices_c_test = np.random.choice(indices_c,
                                              num_test,
                                              replace=False).tolist()
            test_indices.extend(indices_c_test)
            # Set lookup keeps this linear; `i not in ndarray` scans the
            # whole array for every candidate.
            chosen = set(indices_c_test)
            train_indices.extend(i for i in indices_c if i not in chosen)
        test_case_ids = ids[test_indices]
        train_case_ids = ids[train_indices]

    with open(Name_functions.DS_train_ids(s), 'w+') as wf:
        for case_id in train_case_ids:
            wf.write('{}\n'.format(case_id))

    with open(Name_functions.DS_test_ids(s), 'w+') as wf:
        for case_id in test_case_ids:
            wf.write('{}\n'.format(case_id))

    print('Done')
Ejemplo n.º 4
0
def run():
    """Reset the full GRAEC score table and run the three parsers per S.

    The table file is truncated and given its header line. Then, for every
    value in ``Parameters.S_values``, the score folder is created and
    ``parse_graec``, ``parse_previous`` and ``parse_naive`` are called.
    """
    table_path = Name_functions.full_GRAEC_table()

    # Opening with 'w+' discards any previous content of the table
    with open(table_path, 'w+') as table_file:
        table_file.write('Beta;Tau;P;S;Score_type;Score_Value\n')

    for s_value in Parameters.S_values:
        score_folder = Name_functions.S_score_folder(s_value)
        Filefunctions.make_directory(score_folder)
        print('S = {}'.format(s_value))
        parse_graec(s_value)
        parse_previous(s_value)
        parse_naive(s_value)
Ejemplo n.º 5
0
def drop_feature(fn, feature, fn_out=None):
    """Remove one column from the table stored in ``fn`` and export it.

    :param fn: input file; must exist
    :param feature: column name, or int position of the column
    :param fn_out: output file; ``fn`` is overwritten when None
    :return: None (also when the column is missing, after a warning)
    """
    Filefunctions.exists_assert(fn)
    assert isinstance(feature, (str, int))
    frame = import_df(fn)

    # An int selects the column by position
    column = frame.columns[feature] if isinstance(feature, int) else feature

    if column not in frame.columns:
        warn('Feature is not a column')
        return

    frame.drop(labels=[column], axis=1, inplace=True)
    target = fn if fn_out is None else fn_out
    export_df(frame, target)
    def _eval_previous(self):
        """Write per-day accuracy and f1 of the 'Previous' classifier.

        Does nothing when the metric file already exists. Otherwise one line
        ``S;Day;NumEntries;accuracy;f1`` is written for every S in
        ``self.Multi['S']`` and every day for which ``self.get_scores``
        returns both scores.
        """
        print('Parsing Previous ... ', end='', flush=True)
        fn_recent = Name_functions.parameter_evaluation_evaluation_metric_file('Previous')
        if Filefunctions.exists(fn_recent):
            print('Already done')
            return

        row_template = '{};{};{};{};{}\n'
        with open(fn_recent, 'w+') as out:
            out.write('S;Day;NumEntries;accuracy;f1\n')
            for S in self.Multi['S']:
                predictor = Classifiers.PreviousClassifier(S)
                importer = Di(Name_functions.DS_file(S))
                _, labels, times, ids = importer.get_data(
                    fn_subset_ids=self.test_ids_fn,
                    return_split_values=True,
                    return_identifiers=True)

                # One row per case, indexed by case id
                data = pd.DataFrame(index=ids)
                data['time'] = times
                data['y_true'] = [label[0] for label in labels]
                data['Day'] = np.floor(data['time'])

                # Calculate the scores for each day
                for day in data['Day'].unique():
                    day_rows = data[data['Day'] == day]
                    acc_score, f1_score = self.get_scores(
                        predictor=predictor,
                        true_labels=day_rows['y_true'],
                        times=day_rows['time'],
                        ids=day_rows.index)
                    if acc_score is None or f1_score is None:
                        continue
                    out.write(row_template.format(
                        S, day, len(day_rows), acc_score, f1_score))
        print('Done')
Ejemplo n.º 7
0
def parse_naive(s):
    """Write the ids of the medoids of the data before the naive stop time.

    Does nothing when the target file already exists. Only data points whose
    timestamp is below ``Parameters.train_time_naive_stop`` are passed to
    ``KMedoids.reduce_to_medoids``; the ids of the returned medoid indices
    are written one per line.

    :param s: value forwarded to the ``Name_functions`` helpers
    """
    fn_target = Name_functions.DS_reduced_ids_naive(s)

    print('\tD^S_naive ... ', end='', flush=True)
    # Check existence
    if Filefunctions.exists(fn_target):
        print('Already done')
        return

    importer = DataImporter(Name_functions.DS_file(s))
    x, y, timestamps, ids = importer.get_data(return_identifiers=True,
                                              return_split_values=True)

    early_indices = [
        position for position, stamp in enumerate(timestamps)
        if stamp < Parameters.train_time_naive_stop
    ]
    x, y, ids = x[early_indices], y[early_indices], ids[early_indices]

    # Only the indices of the medoids are needed here
    _, _, medoid_indices = KMedoids.reduce_to_medoids(
        x, y, return_indices=True, factor=Parameters.LargeSmallFactor)

    with open(fn_target, 'w+') as wf:
        for medoid_index in medoid_indices:
            wf.write('{}\n'.format(ids[medoid_index]))

    print('Done')
Ejemplo n.º 8
0
def load_dict_from_csv(filename):
    """Read a ``key;value`` file into a dict of str to str.

    Each line is split on the first ';' only, so values may contain ';'.
    Blank lines are skipped.

    :param filename: path of an existing file
    :return: dict mapping every key to its value; later lines win
    """
    assert (Filefunctions.exists(filename))
    ret = dict()
    with open(filename, 'r') as rf:
        for line in rf:
            # rstrip('\n') instead of [:-1]: a final line without a newline
            # must not lose the last character of its value.
            line = line.rstrip('\n')
            if not line:
                continue
            k, v = line.split(';', 1)
            ret[k] = v
    return ret
Ejemplo n.º 9
0
def sort_file_on_date(fn_in, feature, fn_out=None):
    """Sort the table in ``fn_in`` ascending on one column and export it.

    The column is first passed through ``fixDates``.

    :param fn_in: input file; must exist
    :param feature: column name, or int position of the column
    :param fn_out: output file; ``fn_in`` is overwritten when None
    """
    assert isinstance(feature, (str, int)), \
        'Feature must be str or int: {}'.format(feature)
    Filefunctions.exists_assert(fn_in)
    frame = import_df(fn_in)

    # An int selects the column by position
    column = frame.columns[feature] if isinstance(feature, int) else feature

    fixDates(frame=frame, date_column_string=column)
    frame.sort_values(by=[column], ascending=True, inplace=True)

    target = fn_in if fn_out is None else fn_out
    export_df(frame, target)
Ejemplo n.º 10
0
 def __init__(self, filename):
     """Build the event log from a ``case_id;timestamp;activity`` file.

     :param filename: path of an existing file with one event per line
     """
     assert Filefunctions.exists(filename)
     # We implement the event log as a dict, this allows easier reference when adding an event to the case
     # Since each event has a reference to a case_id, not to a case itself
     self.cases = dict()
     with open(filename, 'r') as rf:
         for line in rf:
             # rstrip('\n') instead of [:-1]: a final line without a newline
             # must not lose the last character of its activity.
             case_id, timestamp, act = line.rstrip('\n').split(';')
             # create an event
             e = Event(case_id=case_id, time=float(timestamp), act=act)
             # add it to the corresponding case; a new Case is only built
             # when the case id has not been seen before
             case = self.cases.get(case_id)
             if case is None:
                 case = self.cases[case_id] = Case(case_id=case_id)
             case.add_event(e)
Ejemplo n.º 11
0
    def split_data(self,
                   interval,
                   fn_subset_ids=None,
                   return_split_values=False,
                   return_identifiers=False):
        """Group the data by ``floor(split_value / interval)``.

        :param interval: width of one group in units of the split values
        :param fn_subset_ids: optional file with one id per line; only data
            points whose id is listed are kept
        :param return_split_values: also return the split values per group
        :param return_identifiers: also return the ids per group
        :return: tuple ``(x, y[, split_values][, ids])`` of dicts keyed by
            group number
        :raises Exception: if ``fn_subset_ids`` is given but does not exist
        """
        keep_ids = None
        if fn_subset_ids is not None:
            if not Filefunctions.exists(fn_subset_ids):
                raise Exception(
                    'File does not exist:\n{}'.format(fn_subset_ids))
            with open(fn_subset_ids, 'r') as rf:
                # A set gives constant-time membership; rstrip('\n') keeps
                # the last id intact when the file has no final newline.
                keep_ids = {line.rstrip('\n') for line in rf}

        index_list = dict()
        for (i, s) in enumerate(self.split_values):
            if keep_ids is not None and self.IDS[i] not in keep_ids:
                continue
            group = math.floor(float(s) / interval)
            index_list.setdefault(group, []).append(i)

        return_x = dict()
        return_y = dict()
        return_split = dict()
        return_ids = dict()

        # Densify once instead of once per group
        dense_x = self.X.toarray()
        for (group, indices) in index_list.items():
            return_x[group] = dense_x[indices]
            return_y[group] = self.y[indices]
            if return_split_values:
                return_split[group] = self.split_values[indices]
            if return_identifiers:
                return_ids[group] = self.IDS[indices]

        ret = (
            return_x,
            return_y,
        )
        ret += ((return_split, ) if return_split_values else ())
        ret += ((return_ids, ) if return_identifiers else ())
        return ret
Ejemplo n.º 12
0
    def _eval_param(self, evaluated_parameter):
        """Write per-day accuracy and f1 for every parameter combination.

        Does nothing when the metric file for ``evaluated_parameter`` already
        exists. Otherwise, for every combination of S, Beta, P and Tau given
        by ``self.values(evaluated_parameter, ...)`` and every day, one line
        ``S;Beta;Tau;P;Day;NumEntries;accuracy;f1`` is written whenever
        ``self.get_scores`` returns both scores.

        :param evaluated_parameter: name forwarded to ``self.values`` and to
            the metric-file name helper
        """
        print('Parsing parameter {} ... '.format(evaluated_parameter), end='', flush=True)
        fn_metrics = Name_functions.parameter_evaluation_evaluation_metric_file(evaluated_parameter)
        if Filefunctions.exists(fn_metrics):
            print('Already done')
            return

        row_template = '{};{};{};{};{};{};{};{}\n'
        with open(fn_metrics, 'w+') as out:
            out.write('S;Beta;Tau;P;Day;NumEntries;accuracy;f1\n')
            for S in self.values(evaluated_parameter, 'S'):
                predictor = Classifiers.BPTSClassifier(s=S, score_function=None)
                fn_data = Name_functions.DS_file(S)
                _, labels, times, ids = Di(fn_data).get_data(
                    fn_subset_ids=self.test_ids_fn,
                    return_split_values=True,
                    return_identifiers=True)

                # One row per case, indexed by case id
                data = pd.DataFrame(index=ids)
                data['time'] = times
                data['y_true'] = [label[0] for label in labels]
                data['Day'] = np.floor(data['time'])

                for beta in self.values(evaluated_parameter, 'Beta'):
                    for p in self.values(evaluated_parameter, 'P'):
                        for tau in self.values(evaluated_parameter, 'Tau'):
                            # The same predictor is reused with a new scoring function
                            predictor.set_scoring_function(
                                PeriodScoring(s=S, beta=beta, tau=tau, p=p))
                            for day in data['Day'].unique():
                                day_rows = data[data['Day'] == day]
                                acc_score, f1_score = self.get_scores(
                                    predictor=predictor,
                                    ids=day_rows.index,
                                    times=day_rows['time'],
                                    true_labels=day_rows['y_true'])
                                if acc_score is None or f1_score is None:
                                    continue
                                out.write(row_template.format(
                                    S, beta, tau, p, day, len(day_rows),
                                    acc_score, f1_score))
        print('Done')
Ejemplo n.º 13
0
def parse_ms(s):
    """Write the ids of the medoids of every period of length ``int(s)``.

    Does nothing when the target file already exists. The data is split
    with ``split_data(int(s))``; each group, in sorted key order, is passed
    to ``KMedoids.reduce_to_medoids`` and the ids of the returned indices
    are written one per line.

    :param s: value forwarded to the ``Name_functions`` helpers
    """
    fn_target = Name_functions.DS_reduced_ids_DSJ(s)

    # Check existence
    print('\tD^S_j ... ', end='', flush=True)
    if Filefunctions.exists(fn_target):
        print('Already done')
        return

    importer = DataImporter(Name_functions.DS_file(s))
    x, y, ids = importer.split_data(int(s), return_identifiers=True)

    ids_keep = []
    for group in sorted(x):
        # Only the indices of the medoids within the group are needed
        _, _, medoid_indices = KMedoids.reduce_to_medoids(
            x[group], y[group], return_indices=True)
        group_ids = ids[group]
        ids_keep.extend([group_ids[j] for j in medoid_indices])

    with open(fn_target, 'w+') as wf:
        for kept_id in ids_keep:
            wf.write('{}\n'.format(kept_id))

    print('Done')
Ejemplo n.º 14
0
    def get_data(self,
                 fn_subset_ids=None,
                 return_split_values=False,
                 return_identifiers=False):
        """Return the dense features and labels, optionally for a subset.

        :param fn_subset_ids: optional file with one id per line; only data
            points whose id is listed are returned
        :param return_split_values: also return the split values
        :param return_identifiers: also return the ids
        :return: tuple ``(x, y[, split_values][, ids])``
        :raises Exception: if ``fn_subset_ids`` is given but does not exist
        """
        if fn_subset_ids is None:
            ret = (self.X.toarray(), self.y)
            ret += (self.split_values, ) if return_split_values else ()
            ret += (self.IDS, ) if return_identifiers else ()
        else:
            if not Filefunctions.exists(fn_subset_ids):
                raise Exception(
                    'File does not exist:\n{}'.format(fn_subset_ids))
            with open(fn_subset_ids, 'r') as rf:
                # A set gives constant-time membership; rstrip('\n') keeps
                # the last id intact when the file has no final newline.
                keep_ids = {line.rstrip('\n') for line in rf}
            keep_idx = [
                i for i, identifier in enumerate(self.IDS)
                if identifier in keep_ids
            ]
            ret = (self.X.toarray()[keep_idx], self.y[keep_idx])
            ret += (
                self.split_values[keep_idx], ) if return_split_values else ()
            ret += (self.IDS[keep_idx], ) if return_identifiers else ()

        return ret
Ejemplo n.º 15
0
# This script runs all other scripts in the project. MS stands for Moment / S, since experiments differ in the moment in
# the process when the prediction takes place, and in the subset period length S


def print_state(s):
    """Print ``s`` between two rules of 25 dashes."""
    rule = '-' * 25
    for row in (rule, s, rule):
        print(row)


# Pipeline driver: the steps below run in order at import/execution time.
# In demo mode the root folder is wiped and a fresh event log is generated
# before the regular steps start.
if Parameters.Demo:
    print('Running Demo')
    print_state('Building Dataset')
    # Remove old folder
    Filefunctions.delete(Parameters.root_location)
    # Create new folder
    Filefunctions.make_directory(Parameters.root_location)
    # Generate the demo event log into the freshly created folder
    DEMO_CREATE_EVENTLOG.run()
else:
    print('Running Real Dataset')

# Step 0
print_state('Splitting and labeling event log')
STEP0_Split_And_Label.run()

# Reduce the size of the data set if needed
print_state('Reducing Data Size')
STEP1_Reducing_Class_Sizes.run()

# Train all models
print_state('Training')
Ejemplo n.º 16
0
def save_dict_to_csv(dictionary, filename):
    """Write ``dictionary`` to ``filename`` as one ``key;value`` line per item.

    The parent directory is created through ``Filefunctions.makeParentDir``
    and an existing file is overwritten.
    """
    assert isinstance(dictionary, dict)
    Filefunctions.makeParentDir(filename)
    with open(filename, 'w+') as out:
        out.writelines(
            '{};{}\n'.format(key, value) for key, value in dictionary.items())
Ejemplo n.º 17
0
def undo():
    """Delete the score folder of every S, then undo STEP7_Global_Results."""
    for s_value in Parameters.S_values:
        score_folder = Name_functions.S_score_folder(s_value)
        Filefunctions.delete(score_folder)
    STEP7_Global_Results.undo()
Ejemplo n.º 18
0
def undo():
    """Delete the results folder, then undo STEP8_Over_Time_Scoring."""
    results_dir = Name_functions.results_folder()
    Filefunctions.delete(results_dir)
    STEP8_Over_Time_Scoring.undo()
Ejemplo n.º 19
0
def metric_figure(metric):
    """Return the path of the result figure for ``metric``.

    The parent directory is created through ``Filefunctions.makeParentDir``.
    """
    figure_path = results_folder() + '/Result_{}.png'.format(metric)
    Filefunctions.makeParentDir(figure_path)
    return figure_path
Ejemplo n.º 20
0
def parameter_evaluation_figure(parameter, metric):
    """Return the path of the evaluation figure of ``parameter`` for ``metric``.

    The parent directory is created through ``Filefunctions.makeParentDir``.
    """
    folder = parameter_metric_evaluation_folder(metric)
    figure_path = folder + '/{}.png'.format(parameter)
    Filefunctions.makeParentDir(figure_path)
    return figure_path
Ejemplo n.º 21
0
def best_graec():
    """Return the path of the best-GRAEC-parameters csv file.

    The parent directory is created through ``Filefunctions.makeParentDir``.
    """
    csv_path = parameter_evaluation_folder() + '/best_graec_parameters.csv'
    Filefunctions.makeParentDir(csv_path)
    return csv_path
Ejemplo n.º 22
0
def _write_label_predictions(fn_out, header, predictor, ids, times, labels):
    """Write ``header`` and one ``id;time;true;predicted`` line per case.

    :param fn_out: path of the file to (over)write
    :param header: first line of the file, including its newline
    :param predictor: object with ``predict(case_id=..., time=...)``
    :param ids: case identifiers
    :param times: time value per case
    :param labels: per case an indexable whose element 0 is the true label
    """
    with open(fn_out, 'w+') as wf:
        wf.write(header)
        for case_id, t, true_label in zip(ids, times, labels):
            predicted_label = predictor.predict(case_id=case_id, time=t)
            wf.write('{};{};{};{}\n'.format(
                case_id, t, true_label[0], predicted_label))


def parse_ms(s):
    """Write GRAEC, naive and previous predictions for one S.

    Does nothing when the enumeration dictionary for ``s`` already exists.
    Otherwise every (beta, tau, p) combination gets an enumeration number
    and its train and test predictions are written; the number-to-parameter
    mapping is saved as a csv. Finally the naive and previous classifiers
    are run on the test cases; a prediction of None is not written.

    :param s: value forwarded to the name helpers and the classifiers
    """
    print('\tGRAEC ... ', end='', flush=True)
    if Filefunctions.exists(Name_functions.S_GRAEC_enumeration_dictionary(s)):
        print('Already done')
        return

    enumeration_encoder = dict()

    # Import the data file once and take both subsets from the same importer.
    # The test subset is reused further down for the naive and previous
    # classifiers instead of being loaded again with identical arguments.
    importer = DI(Name_functions.DS_file(s))
    _, labels_train, times_train, ids_train = importer.get_data(
        fn_subset_ids=Name_functions.DS_train_ids(s),
        return_split_values=True,
        return_identifiers=True)
    _, labels_test, times_test, ids_test = importer.get_data(
        fn_subset_ids=Name_functions.DS_test_ids(s),
        return_split_values=True,
        return_identifiers=True)

    enumeration = 0
    predictor = Classifiers.BPTSClassifier(s=s, score_function=None)

    for B in Parameters.GRAEC_beta:
        for T in Parameters.GRAEC_tau:
            # P has no use for T == 0
            p_values = Parameters.GRAEC_p if not T == 0 else [0]
            for P in p_values:
                enumeration_encoder[enumeration] = '{};{};{}'.format(B, T, P)
                predictor.set_scoring_function(
                    score_function=PeriodScoring(beta=B, p=P, tau=T, s=s))

                # The train and test files use different names for the id
                # column; both headers are kept as they were.
                _write_label_predictions(
                    Name_functions.S_GRAEC_train_predictions(s, enumeration),
                    'SOID;time;True_label;Predicted_label\n',
                    predictor, ids_train, times_train, labels_train)
                _write_label_predictions(
                    Name_functions.S_GRAEC_test_predictions(s, enumeration),
                    'Case_id;time;True_label;Predicted_label\n',
                    predictor, ids_test, times_test, labels_test)

                enumeration += 1

    Human_Functions.save_dict_to_csv(
        enumeration_encoder, Name_functions.S_GRAEC_enumeration_dictionary(s))

    print('Done')
    print('\tNaive and Previous ... ', end='', flush=True)

    naive_predictor = Classifiers.NaiveClassifier(s)
    previous_predictor = Classifiers.PreviousClassifier(s)
    header = 'Case_id;time;True_label;Predicted_label\n'
    with open(Name_functions.S_naive_test_predictions(s), 'w+') as wf_naive:
        with open(Name_functions.S_recent_test_predictions(s),
                  'w+') as wf_previous:
            wf_naive.write(header)
            wf_previous.write(header)
            # Both classifiers are queried per case in one pass, naive first
            for case_id, t, true_label in zip(ids_test, times_test,
                                              labels_test):
                predicted_label_naive = naive_predictor.predict(
                    case_id=case_id, time=t)
                if predicted_label_naive is not None:
                    wf_naive.write('{};{};{};{}\n'.format(
                        case_id, t, true_label[0], predicted_label_naive))
                predicted_label_previous = previous_predictor.predict(
                    case_id=case_id, time=t)
                if predicted_label_previous is not None:
                    wf_previous.write('{};{};{};{}\n'.format(
                        case_id, t, true_label[0], predicted_label_previous))
    print('Done')
Ejemplo n.º 23
0
def full_GRAEC_table():
    """Return the path of the full GRAEC score table.

    The parent directory is created through ``Filefunctions.makeParentDir``.
    """
    table_path = results_folder() + '/GRAEC Full Scores.csv'
    Filefunctions.makeParentDir(table_path)
    return table_path
Ejemplo n.º 24
0
def metric_table(metric):
    """Return the path of the GRAEC TexTable text file for ``metric``.

    The parent directory is created through ``Filefunctions.makeParentDir``.
    """
    table_path = results_folder() + '/GRAEC TexTable {}.txt'.format(metric)
    Filefunctions.makeParentDir(table_path)
    return table_path
Ejemplo n.º 25
0
def S_J_values(S):
    """Return the entries of the 'Splits' folder of ``S``.

    :return: the ``os.listdir`` result, or an empty list when the folder
        does not exist
    """
    splits_dir = S_folder(S) + '/Splits'
    if not Filefunctions.exists(splits_dir):
        return []
    return os.listdir(splits_dir)
Ejemplo n.º 26
0
def parameter_evaluation_evaluation_metric_file(parameter):
    """Return the path of the evaluation-metric csv for ``parameter``.

    The parent directory is created through ``Filefunctions.makeParentDir``.
    """
    data_folder = parameter_evaluation_data_folder()
    csv_path = data_folder + '/{}.csv'.format(parameter)
    Filefunctions.makeParentDir(csv_path)
    return csv_path
Ejemplo n.º 27
0
def S_GRAEC_enumeration_folder(S):
    """Return the 'BEPT_Enumeration' folder of ``S``.

    The folder is passed to ``Filefunctions.make_directory`` first.
    """
    enumeration_dir = S_folder(S) + '/BEPT_Enumeration'
    Filefunctions.make_directory(enumeration_dir)
    return enumeration_dir
Ejemplo n.º 28
0
def undo():
    """Delete the prediction outputs of every S, then undo STEP6_Global_Scoring.

    Per S the GRAEC enumeration folder and the naive and recent test
    prediction files are deleted, in that order.
    """
    for s_value in Parameters.S_values:
        enumeration_dir = Name_functions.S_GRAEC_enumeration_folder(s_value)
        Filefunctions.delete(enumeration_dir)
        naive_file = Name_functions.S_naive_test_predictions(s_value)
        Filefunctions.delete(naive_file)
        recent_file = Name_functions.S_recent_test_predictions(s_value)
        Filefunctions.delete(recent_file)
    STEP6_Global_Scoring.undo()
Ejemplo n.º 29
0
def undo():
    """Delete the parameter-evaluation data folder, then undo STEP9_Over_Time_Results."""
    data_folder = Name_functions.parameter_evaluation_data_folder()
    Filefunctions.delete(data_folder)
    STEP9_Over_Time_Results.undo()