Example #1
0
def create_and_evaluate_model(args):
    """Hyperopt objective: train `cls_method` on each CV fold and return
    the negated mean ROC-AUC as the loss to minimise.

    Parameters
    ----------
    args : dict
        Hyperparameter values proposed by the optimiser; passed straight
        through to ClassifierFactory.get_classifier.

    Returns
    -------
    dict
        Hyperopt result: 'loss' (negative mean AUC over `n_splits` folds),
        'status', and the last fold's fitted 'model'.

    Side effects: increments the global `trial_nr` and appends one row per
    hyperparameter to the global `all_results` list.
    """
    global trial_nr, all_results
    trial_nr += 1

    print("Trial %s out of %s" % (trial_nr, n_iter))

    score = 0

    for cv_iter in range(n_splits):

        # read pre-encoded train/test folds produced by an earlier step
        dt_train = pd.read_csv(os.path.join(folds_dir,
                                            "fold%s_train.csv" % cv_iter),
                               sep=";")
        dt_test = pd.read_csv(os.path.join(folds_dir,
                                           "fold%s_test.csv" % cv_iter),
                              sep=";")

        # labels are pickled despite the .csv extension
        with open(os.path.join(folds_dir, "fold%s_train_y.csv" % cv_iter),
                  "rb") as fin:
            train_y = np.array(pickle.load(fin))
        with open(os.path.join(folds_dir, "fold%s_test_y.csv" % cv_iter),
                  "rb") as fin:
            test_y = np.array(pickle.load(fin))

        # fit classifier and predict
        cls = ClassifierFactory.get_classifier(cls_method, args, random_state,
                                               min_cases_for_training,
                                               class_ratios[cv_iter])

        if cls_method == 'catboost':
            # catboost needs the categorical feature names passed explicitly
            cls.fit(
                dt_train, train_y,
                list(
                    dt_train.select_dtypes(
                        include=['object', 'category']).columns))
        else:
            cls.fit(dt_train, train_y)
        preds = cls.predict_proba(dt_test)

        # AUC is undefined when only one class occurs in the fold;
        # such folds contribute 0 to the score
        if len(set(test_y)) >= 2:
            score += roc_auc_score(test_y, preds)

    # save current trial results
    for k, v in args.items():
        all_results.append((trial_nr, k, v, -1, score / n_splits))

    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
                    'case_id_col': dataset_manager.case_id_col,
                    'static_cat_cols': [],
                    'static_num_cols': [],
                    'dynamic_cat_cols': [],
                    'dynamic_num_cols': dynamic_text_cols,
                    'fillna': True
                }
                encoders.append(
                    (text_enc,
                     EncoderFactory.get_encoder(text_enc, **cls_encoder_args)))

            feature_combiner = FeatureUnion(encoders)

            # fit classifier and predict
            cls = ClassifierFactory.get_classifier(cls_method, cls_args,
                                                   random_state,
                                                   min_cases_for_training,
                                                   overall_class_ratio)

            if cls_method == "svm" or cls_method == "logit":
                pipeline = Pipeline([('encoder', feature_combiner),
                                     ('scaler', StandardScaler()),
                                     ('cls', cls)])
            else:
                pipeline = Pipeline([('encoder', feature_combiner),
                                     ('cls', cls)])

            start = time.time()
            pipeline.fit(dt_train_prefixes, train_y)
            time_train += time.time() - start

            del dt_train_prefixes
Example #3
0
                            knn_idxs].index
                        dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(
                            dt_train_prefixes,
                            relevant_cases_bucket)  # one row per event
                        train_y = dataset_manager.get_label_numeric(
                            dt_train_bucket)

                        feature_combiner = FeatureUnion([
                            (method,
                             EncoderFactory.get_encoder(
                                 method, **cls_encoder_args))
                            for method in methods
                        ])
                        pipeline = Pipeline([('encoder', feature_combiner),
                                             ('cls',
                                              ClassifierFactory.get_classifier(
                                                  cls_method, **cls_args))])

                        # fit the classifier based on nearest neighbors
                        pipeline.fit(dt_train_bucket, train_y)

                        # select current test case
                        relevant_test_case = [encoded_test.index[i]]
                        dt_test_bucket = dataset_manager.get_relevant_data_by_indexes(
                            dt_test_prefixes, relevant_test_case)

                        # predict
                        test_y.extend(
                            dataset_manager.get_label_numeric(dt_test_bucket))
                        preds.extend(pipeline.predict_proba(dt_test_bucket))

                    if len(set(test_y)) < 2:
Example #4
0
        test_y = dataset_manager.get_label_numeric(dt_test_bucket)
            
        # add data about prefixes in this bucket (class labels and prefix lengths)
        nr_events_all.extend(list(dataset_manager.get_prefix_lengths(dt_test_bucket)))
        test_y_all.extend(test_y)

        # encode the prefixes
        feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods])
        if "svm" in cls_method or "logit" in cls_method:
            feature_combiner = Pipeline([('encoder', feature_combiner), ('scaler', StandardScaler())])
            
        X_train = feature_combiner.fit_transform(dt_train_bucket)
        X_test = feature_combiner.transform(dt_test_bucket)

        # fit classifier and calibrate
        cls = ClassifierFactory.get_classifier(cls_method.replace("_calibrated", ""), current_args, random_state, min_cases_for_training, overall_class_ratio, binary=(False if "calibrate" in cls_method else True))
        cls.fit(X_train, train_y)

        if "calibrate" in cls_method:
            relevant_val_cases_bucket = dataset_manager.get_indexes(dt_val_prefixes)[bucket_assignments_val == bucket]
            dt_val_bucket = dataset_manager.get_relevant_data_by_indexes(dt_val_prefixes, relevant_val_cases_bucket)

            X_val = feature_combiner.transform(dt_val_bucket)
            y_val = dataset_manager.get_label_numeric(dt_val_bucket)
            
            cls = CalibratedClassifierCV(cls, cv="prefit", method='sigmoid')
            cls.fit(X_val, np.array(y_val))

        # predict 
        preds = cls.predict_proba(X_test)
        if "calibrate" in cls_method:
            
            # set optimal params for this bucket
            if bucket_method == "prefix":
                cls_args = {k:v for k,v in best_params[dataset_name][method_name][cls_method][bucket].items() if k not in ['n_clusters', 'n_neighbors']}
            else:
                cls_args = {k:v for k,v in best_params[dataset_name][method_name][cls_method].items() if k not in ['n_clusters', 'n_neighbors']}
            cls_args['random_state'] = random_state
            cls_args['min_cases_for_training'] = n_min_cases_in_bucket
        
            # select relevant cases
            relevant_cases_bucket = dataset_manager.get_indexes(dt_train_prefixes)[bucket_assignments_train == bucket]
            dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(dt_train_prefixes, relevant_cases_bucket) # one row per event
            train_y = dataset_manager.get_label_numeric(dt_train_bucket)
            
            feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods])
            pipelines[bucket] = Pipeline([('encoder', feature_combiner), ('cls', ClassifierFactory.get_classifier(cls_method, **cls_args))])
            
            pipelines[bucket].fit(dt_train_bucket, train_y)
            
            
        
        prefix_lengths_test = dt_test_prefixes.groupby(dataset_manager.case_id_col).size()
        
        # test separately for each prefix length
        for nr_events in range(min_prefix_length, max_prefix_length+1):
            print("Predicting for %s events..."%nr_events)

            # select only cases that are at least of length nr_events
            relevant_cases_nr_events = prefix_lengths_test[prefix_lengths_test == nr_events].index

            if len(relevant_cases_nr_events) == 0:
def create_and_evaluate_model(args):
    """Hyperopt objective for the text-augmented pipelines: encodes textual
    columns with `text_method`, builds sequence encoders plus a classifier,
    and returns the negated mean ROC-AUC over `n_splits` CV folds.

    Parameters
    ----------
    args : dict
        Proposed hyperparameters; split into classifier args (keys listed in
        the global `cls_params`) and text-transformer args (everything else).

    Returns
    -------
    dict
        Hyperopt result: 'loss', 'status', and the last fold's fitted 'model'.

    Side effects: increments the global `trial_nr` and appends one row per
    hyperparameter (classifier and text) to the global `all_results` list.
    """
    global trial_nr, all_results
    trial_nr += 1

    print("Trial %s out of %s" % (trial_nr, n_iter))

    score = 0

    # split the proposed hyperparameters between classifier and text model
    cls_args = {k: v for k, v in args.items() if k in cls_params}
    text_transformer_args = {k: v for k, v in args.items() if k not in cls_params}
    cls_args['n_estimators'] = 500

    for cv_iter in range(n_splits):

        # read encoded data
        train_chunk = dataset_manager.read_fold(os.path.join(folds_dir, "fold%s_train.csv" % cv_iter))
        test_chunk = dataset_manager.read_fold(os.path.join(folds_dir, "fold%s_test.csv" % cv_iter))

        # fill in dataset/method-specific defaults for the text transformer
        if text_method in ["nb", "bong"]:
            if dataset_ref in ["crm2", "github"] and cls_method == "xgboost" and "single" in bucket_enc:
                if "index" in bucket_enc:
                    text_transformer_args["nr_selected"] = 100
                    cls_args['n_estimators'] = 200
                else:
                    text_transformer_args["nr_selected"] = 200
            else:
                text_transformer_args["nr_selected"] = 500
            if 'ngram_max' not in text_transformer_args:
                text_transformer_args['ngram_max'] = 1
            if text_method == "nb":
                text_transformer_args["pos_label"] = dataset_manager.pos_label
        elif text_method in ["pv", "lda"]:
            text_transformer_args["random_seed"] = 22
        # both datasets use the same frequency threshold
        if dataset_name in ["github", "crm2"]:
            text_transformer_args["min_freq"] = 20

        # fit the text model on all text columns jointly; the transformed
        # output is regenerated per column below, so only the fit matters here
        text_transformer = EncoderFactory.get_encoder(text_method, text_transformer_args=text_transformer_args)
        text_transformer.fit_transform(train_chunk[dataset_manager.static_text_cols+dataset_manager.dynamic_text_cols],
                                       train_chunk[dataset_manager.label_col])

        # replace each raw text column with its encoded feature columns
        static_text_cols = []
        dynamic_text_cols = []
        for col in dataset_manager.static_text_cols + dataset_manager.dynamic_text_cols:
            dt_train_text = text_transformer.transform(train_chunk[[col]], train_chunk[dataset_manager.label_col])
            current_text_cols = ["%s_%s" % (col, text_col) for text_col in dt_train_text.columns]
            dt_train_text.columns = current_text_cols
            dt_test_text = text_transformer.transform(test_chunk[[col]])
            dt_test_text.columns = current_text_cols
            train_chunk = pd.concat([train_chunk.drop(col, axis=1), dt_train_text], axis=1, sort=False)
            test_chunk = pd.concat([test_chunk.drop(col, axis=1), dt_test_text], axis=1, sort=False)
            if col in dataset_manager.static_text_cols:
                static_text_cols.extend(current_text_cols)
            else:
                dynamic_text_cols.extend(current_text_cols)
            del dt_train_text, dt_test_text

        # generate prefixes (fixed length when nr_events is given, else a range)
        if nr_events is not None:
            dt_train_prefixes = dataset_manager.generate_prefix_data(train_chunk, nr_events, nr_events)
            dt_test_prefixes = dataset_manager.generate_prefix_data(test_chunk, nr_events, nr_events)
        else:
            dt_train_prefixes = dataset_manager.generate_prefix_data(train_chunk, min_prefix_length, max_prefix_length)
            dt_test_prefixes = dataset_manager.generate_prefix_data(test_chunk, min_prefix_length, max_prefix_length)

        train_y = dataset_manager.get_label_numeric(dt_train_prefixes)
        test_y = dataset_manager.get_label_numeric(dt_test_prefixes)

        # set up sequence encoders; fold the text features into the main
        # encoding only when it matches text_enc. The choice is invariant
        # across methods, so it is made once, outside the loop.
        if cls_encoding == text_enc:
            cls_encoder_args = {'case_id_col': dataset_manager.case_id_col,
                                'static_cat_cols': dataset_manager.static_cat_cols,
                                'static_num_cols': dataset_manager.static_num_cols + static_text_cols,
                                'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                                'dynamic_num_cols': dataset_manager.dynamic_num_cols + dynamic_text_cols,
                                'fillna': True}
        else:
            cls_encoder_args = {'case_id_col': dataset_manager.case_id_col,
                                'static_cat_cols': dataset_manager.static_cat_cols,
                                'static_num_cols': dataset_manager.static_num_cols + static_text_cols,
                                'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                                'dynamic_num_cols': dataset_manager.dynamic_num_cols,
                                'fillna': True}
        encoders = [(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods]

        # dynamic text features still need a dedicated encoder when no
        # other encoder covers them
        if cls_encoding != text_enc and text_enc not in methods:
            cls_encoder_args = {'case_id_col': dataset_manager.case_id_col,
                                'static_cat_cols': [],
                                'static_num_cols': [],
                                'dynamic_cat_cols': [],
                                'dynamic_num_cols': dynamic_text_cols,
                                'fillna': True}
            encoders.append((text_enc, EncoderFactory.get_encoder(text_enc, **cls_encoder_args)))

        feature_combiner = FeatureUnion(encoders)

        # fit classifier and predict
        cls = ClassifierFactory.get_classifier(cls_method, cls_args, random_state, min_cases_for_training, class_ratios[cv_iter])

        # linear models need feature scaling
        if cls_method == "svm" or cls_method == "logit":
            pipeline = Pipeline([('encoder', feature_combiner), ('scaler', StandardScaler()), ('cls', cls)])
        else:
            pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)])

        pipeline.fit(dt_train_prefixes, train_y)
        preds = pipeline.predict_proba(dt_test_prefixes)

        # AUC is undefined for single-class folds; they contribute 0
        if len(set(test_y)) >= 2:
            score += roc_auc_score(test_y, preds)

    # save current trial results
    for k, v in cls_args.items():
        all_results.append((trial_nr, k, v, -1, score / n_splits))
    for k, v in text_transformer_args.items():
        all_results.append((trial_nr, k, v, -1, score / n_splits))

    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
        train_y = dataset_manager.get_label_numeric(dt_train_bucket)

        # extract data about prefixes in this bucket (class labels and prefix lengths)
        test_y = dataset_manager.get_label_numeric(dt_test_bucket)
        test_nr_events = list(
            dataset_manager.get_prefix_lengths(dt_test_bucket))
        test_case_ids = list(
            dt_test_bucket.groupby(dataset_manager.case_id_col).first().index)

        # initialize pipeline for sequence encoder and classifier
        feature_combiner = FeatureUnion([
            (method, EncoderFactory.get_encoder(method, **cls_encoder_args))
            for method in methods
        ])
        cls = ClassifierFactory.get_classifier(cls_method, params, None,
                                               min_cases_for_training,
                                               overall_class_ratio)
        pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)])

        # fit pipeline
        pipeline.fit(dt_train_bucket, train_y)

        # predict
        preds = pipeline.predict_proba(dt_test_bucket)

        dt_all_predictions = pd.concat([
            dt_all_predictions,
            pd.DataFrame({
                "predicted": preds,
                "actual": test_y,
                "case_id": test_case_ids,
Example #8
0
            print("Fitting pipeline for bucket %s..." % bucket)
            relevant_cases_bucket = dataset_manager.get_indexes(
                dt_train_prefixes)[bucket_assignments_train == bucket]
            dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(
                dt_train_prefixes, relevant_cases_bucket)  # one row per event
            train_y = dataset_manager.get_label(dt_train_bucket,
                                                label_col=label_col)

            feature_combiner = FeatureUnion([
                (method, EncoderFactory.get_encoder(method,
                                                    **cls_encoder_args))
                for method in methods
            ])
            pipelines[bucket] = Pipeline([
                ('encoder', feature_combiner),
                ('cls', ClassifierFactory.get_classifier(cls_method, **params))
            ])

            pipelines[bucket].fit(dt_train_bucket, train_y)

        # if the bucketing is prefix-length-based, then evaluate for each prefix length separately, otherwise evaluate all prefixes together
        max_evaluation_prefix_length = max_prefix_length if bucket_method == "prefix" else min_prefix_length

        prefix_lengths_test = dt_test_prefixes.groupby(
            dataset_manager.case_id_col).size()

        # test separately for each prefix length
        for nr_events in range(min_prefix_length,
                               max_evaluation_prefix_length + 1):
            print("Predicting for %s events..." % nr_events)
def create_and_evaluate_model(args):
    """Hyperopt objective optimising a trade-off between accuracy and
    prediction stability: trains the classifier `n_runs` times per fold and
    penalises the mean squared pairwise difference (MSPD) between runs.

    Parameters
    ----------
    args : dict
        Proposed hyperparameters; `n_estimators` is forced to 500.

    Returns
    -------
    dict
        Hyperopt result with 'loss' = -(alpha * mean AUC - beta * sqrt(MSPD))
        and 'status'.

    Side effects: increments the global `trial_nr` and appends one row per
    hyperparameter to the global `all_results` list.
    """
    global trial_nr, all_results, n_runs, alpha, beta
    trial_nr += 1

    print("Trial %s out of %s" % (trial_nr, n_iter))

    args['n_estimators'] = 500

    score_auc = 0
    preds_all = pd.DataFrame()
    for cv_iter in range(n_splits):

        # read pre-encoded train/test folds
        dt_train = pd.read_csv(os.path.join(folds_dir,
                                            "fold%s_train.csv" % cv_iter),
                               sep=";")
        dt_test = pd.read_csv(os.path.join(folds_dir,
                                           "fold%s_test.csv" % cv_iter),
                              sep=";")
        dt_test = dt_test.fillna(0)

        # labels are pickled despite the .csv extension
        with open(os.path.join(folds_dir, "fold%s_train_y.csv" % cv_iter),
                  "rb") as fin:
            train_y = np.array(pickle.load(fin))
        with open(os.path.join(folds_dir, "fold%s_test_y.csv" % cv_iter),
                  "rb") as fin:
            test_y = np.array(pickle.load(fin))

        # repeat training to measure run-to-run prediction variability
        for current_run in range(n_runs):
            # random_state=None so each run gets a different seed, which is
            # what the stability measurement requires
            cls = ClassifierFactory.get_classifier(cls_method, args, None,
                                                   min_cases_for_training,
                                                   class_ratios[cv_iter])
            cls.fit(dt_train, train_y)
            preds = cls.predict_proba(dt_test)
            preds_all = pd.concat([
                preds_all,
                pd.DataFrame({
                    'predicted': preds,
                    'run': current_run,
                    'idx': range(len(preds))
                })
            ],
                                  axis=0,
                                  sort=False)

            score_auc += roc_auc_score(test_y, preds)

    score_auc = score_auc / n_splits / n_runs

    # mean squared pairwise difference between predictions of distinct runs;
    # the 2/(n*(n-1)) factor averages over the n_runs-choose-2 pairs
    mspd_acc = 0
    for i in range(n_runs):
        tmp1 = preds_all[preds_all.run == i]
        for j in range(i):
            tmp2 = preds_all[preds_all.run == j]
            tmp_merged = tmp1.merge(tmp2, on=["idx"])

            mspd_acc += 2.0 / (n_runs * (n_runs - 1)) * np.mean(
                np.power(tmp_merged.predicted_x - tmp_merged.predicted_y, 2))

    score = alpha * score_auc - beta * np.sqrt(mspd_acc)

    # save current trial results
    for k, v in args.items():
        all_results.append(
            (trial_nr, k, v, -1, score_auc, np.sqrt(mspd_acc), score))

    return {'loss': -score, 'status': STATUS_OK}
Example #10
0
def create_and_evaluate_model(args):
    """Hyperopt objective for the wavelet-based encodings: loads the
    pre-encoded folds (combining wavelet and last-state features for the
    wavelet encodings), trains `cls_method`, and returns the negated mean
    ROC-AUC over `n_splits` folds.

    Parameters
    ----------
    args : dict
        Proposed hyperparameters, passed to ClassifierFactory.get_classifier.

    Returns
    -------
    dict
        Hyperopt result: 'loss', 'status', and the last fold's fitted 'model'.

    Side effects: increments the global `trial_nr`, appends one row per
    hyperparameter to the global `all_results` list, and (for catboost with
    a wavelet encoding) dumps the combined dtypes to an 'outfile_*' pickle.
    """
    global trial_nr, all_results
    trial_nr += 1

    print("Trial %s out of %s" % (trial_nr, n_iter))

    score = 0

    for cv_iter in range(n_splits):

        # build the feature tables for this fold
        if cls_encoding in ("waveletLast", "waveletAgg", "waveletIndex"):
            # combine the wavelet features with the last-state encoding
            dt_train_last = pd.read_csv(os.path.join(folds_dir, "fold%s_train_last.csv" % cv_iter), sep=";")
            dt_test_last = pd.read_csv(os.path.join(folds_dir, "fold%s_test_last.csv" % cv_iter), sep=";")

            dt_train_wavelet = pd.read_csv(os.path.join(folds_dir, "fold%s_train_wavelet.csv" % cv_iter), sep=";")
            dt_test_wavelet = pd.read_csv(os.path.join(folds_dir, "fold%s_test_wavelet.csv" % cv_iter), sep=";")

            dt_train = add_features(dt_train_wavelet, dt_train_last, numberOfFeatures=10)
            dt_train.columns = list(range(dt_train.shape[1]))
            dt_test = add_features(dt_test_wavelet, dt_test_last, numberOfFeatures=10)
            dt_test.columns = list(range(dt_test.shape[1]))
            # only the combined-feature variant records its dtypes (see below)
            dump_types = True
        else:
            dt_train = pd.read_csv(os.path.join(folds_dir, "fold%s_train.csv" % cv_iter), sep=";")
            dt_test = pd.read_csv(os.path.join(folds_dir, "fold%s_test.csv" % cv_iter), sep=";")
            dump_types = False

        # labels are pickled despite the .csv extension
        with open(os.path.join(folds_dir, "fold%s_train_y.csv" % cv_iter), "rb") as fin:
            train_y = np.array(pickle.load(fin))
        with open(os.path.join(folds_dir, "fold%s_test_y.csv" % cv_iter), "rb") as fin:
            test_y = np.array(pickle.load(fin))

        # fit classifier and predict
        cls = ClassifierFactory.get_classifier(cls_method, args, random_state, min_cases_for_training,
                                               class_ratios[cv_iter])

        if cls_method == 'catboost':
            if dump_types:
                with open('outfile' + '_' + cls_method + '_' + cls_encoding, 'wb') as fp:
                    pickle.dump(get_types(dt_train), fp)
            # catboost needs the categorical feature names passed explicitly
            cls.fit(dt_train, train_y, list(dt_train.select_dtypes(include=['object', 'category']).columns))
        else:
            if cls_method in ('svm', 'logit', 'rf'):
                # these classifiers cannot handle inf/NaN values
                dt_train.replace([np.inf, -np.inf], np.nan, inplace=True)
                dt_train.fillna(0, inplace=True)
                dt_test.replace([np.inf, -np.inf], np.nan, inplace=True)
                dt_test.fillna(0, inplace=True)
            cls.fit(dt_train, train_y)

        preds = cls.predict_proba(dt_test)

        # AUC is undefined for single-class folds; they contribute 0
        if len(set(test_y)) >= 2:
            score += roc_auc_score(test_y, preds)

    # save current trial results
    for k, v in args.items():
        all_results.append((trial_nr, k, v, -1, score / n_splits))

    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
                    cls_args = best_params[key_id][method_name][cls_method]
                cls_args['mode'] = mode
                cls_args['random_state'] = random_state
                cls_args['min_cases_for_training'] = n_min_cases_in_bucket
                #print("Cls params are: %s" % str(list(cls_args.values())))

                # select relevant cases
                relevant_cases_bucket = dataset_manager.get_indexes(dt_train_prefixes)[bucket_assignments_train == bucket]
                dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(dt_train_prefixes,
                                                                               relevant_cases_bucket)  # one row per event
                train_y = dataset_manager.get_label(dt_train_bucket, label_col=label_col, mode=mode)

                feature_combiner = FeatureUnion(
                    [(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods])
                pipelines[bucket] = Pipeline(
                    [('encoder', feature_combiner), ('cls', ClassifierFactory.get_classifier(cls_method, **cls_args))])

                pipelines[bucket].fit(dt_train_bucket, train_y)

                # feature_set = [] if self.hardcoded_prediction is not None:
                # for feature_set_this_encoding in pipelines[bucket].steps[0][1].transformer_list:
                #     for feature in feature_set_this_encoding[1].columns.tolist():
                #         feature_set.append(feature)
                #
                # feats = {}  # a dict to hold feature_name: feature_importance
                # for feature, importance in zip(feature_set, pipelines[bucket].named_steps.cls.cls.feature_importances_):
                #     feats[feature] = importance  # add the name/value pair
                #
                # importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'})
                # importances = importances.sort_values(by='Gini-importance', ascending=False)
                # importances.to_csv(os.path.join(home_dir, feature_importance_dir, "feat_importance_%s_%s_%s_%s_%s.csv" %