def create_and_evaluate_model(args):
    """Hyperopt objective: cross-validated ROC-AUC for one hyperparameter setting.

    Reads pre-encoded train/test folds from ``folds_dir``, fits a classifier
    per fold and averages ROC-AUC over the ``n_splits`` folds.

    Relies on module-level state: ``trial_nr``, ``all_results``, ``n_iter``,
    ``n_splits``, ``folds_dir``, ``cls_method``, ``random_state``,
    ``min_cases_for_training`` and ``class_ratios``.

    :param args: hyperparameter dict sampled by hyperopt.
    :return: dict with the negated mean AUC as 'loss' (hyperopt minimizes),
             'status' and the classifier fitted on the last fold as 'model'.
    """
    global trial_nr, all_results
    trial_nr += 1
    print("Trial %s out of %s" % (trial_nr, n_iter))

    score = 0
    for cv_iter in range(n_splits):
        # read encoded data for the current fold
        dt_train = pd.read_csv(os.path.join(folds_dir, "fold%s_train.csv" % cv_iter), sep=";")
        dt_test = pd.read_csv(os.path.join(folds_dir, "fold%s_test.csv" % cv_iter), sep=";")
        with open(os.path.join(folds_dir, "fold%s_train_y.csv" % cv_iter), "rb") as fin:
            train_y = np.array(pickle.load(fin))
        with open(os.path.join(folds_dir, "fold%s_test_y.csv" % cv_iter), "rb") as fin:
            test_y = np.array(pickle.load(fin))

        # fit classifier and predict
        cls = ClassifierFactory.get_classifier(cls_method, args, random_state,
                                               min_cases_for_training, class_ratios[cv_iter])
        if cls_method == 'catboost':
            # catboost needs the categorical feature names passed explicitly
            cls.fit(dt_train, train_y,
                    list(dt_train.select_dtypes(include=['object', 'category']).columns))
        else:
            cls.fit(dt_train, train_y)

        preds = cls.predict_proba(dt_test)
        # AUC is undefined when only one class is present in the fold
        if len(set(test_y)) >= 2:
            score += roc_auc_score(test_y, preds)

    # save current trial results
    for k, v in args.items():
        all_results.append((trial_nr, k, v, -1, score / n_splits))

    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
'case_id_col': dataset_manager.case_id_col, 'static_cat_cols': [], 'static_num_cols': [], 'dynamic_cat_cols': [], 'dynamic_num_cols': dynamic_text_cols, 'fillna': True } encoders.append( (text_enc, EncoderFactory.get_encoder(text_enc, **cls_encoder_args))) feature_combiner = FeatureUnion(encoders) # fit classifier and predict cls = ClassifierFactory.get_classifier(cls_method, cls_args, random_state, min_cases_for_training, overall_class_ratio) if cls_method == "svm" or cls_method == "logit": pipeline = Pipeline([('encoder', feature_combiner), ('scaler', StandardScaler()), ('cls', cls)]) else: pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)]) start = time.time() pipeline.fit(dt_train_prefixes, train_y) time_train += time.time() - start del dt_train_prefixes
knn_idxs].index dt_train_bucket = dataset_manager.get_relevant_data_by_indexes( dt_train_prefixes, relevant_cases_bucket) # one row per event train_y = dataset_manager.get_label_numeric( dt_train_bucket) feature_combiner = FeatureUnion([ (method, EncoderFactory.get_encoder( method, **cls_encoder_args)) for method in methods ]) pipeline = Pipeline([('encoder', feature_combiner), ('cls', ClassifierFactory.get_classifier( cls_method, **cls_args))]) # fit the classifier based on nearest neighbors pipeline.fit(dt_train_bucket, train_y) # select current test case relevant_test_case = [encoded_test.index[i]] dt_test_bucket = dataset_manager.get_relevant_data_by_indexes( dt_test_prefixes, relevant_test_case) # predict test_y.extend( dataset_manager.get_label_numeric(dt_test_bucket)) preds.extend(pipeline.predict_proba(dt_test_bucket)) if len(set(test_y)) < 2:
test_y = dataset_manager.get_label_numeric(dt_test_bucket) # add data about prefixes in this bucket (class labels and prefix lengths) nr_events_all.extend(list(dataset_manager.get_prefix_lengths(dt_test_bucket))) test_y_all.extend(test_y) # encode the prefixes feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods]) if "svm" in cls_method or "logit" in cls_method: feature_combiner = Pipeline([('encoder', feature_combiner), ('scaler', StandardScaler())]) X_train = feature_combiner.fit_transform(dt_train_bucket) X_test = feature_combiner.transform(dt_test_bucket) # fit classifier and calibrate cls = ClassifierFactory.get_classifier(cls_method.replace("_calibrated", ""), current_args, random_state, min_cases_for_training, overall_class_ratio, binary=(False if "calibrate" in cls_method else True)) cls.fit(X_train, train_y) if "calibrate" in cls_method: relevant_val_cases_bucket = dataset_manager.get_indexes(dt_val_prefixes)[bucket_assignments_val == bucket] dt_val_bucket = dataset_manager.get_relevant_data_by_indexes(dt_val_prefixes, relevant_val_cases_bucket) X_val = feature_combiner.transform(dt_val_bucket) y_val = dataset_manager.get_label_numeric(dt_val_bucket) cls = CalibratedClassifierCV(cls, cv="prefit", method='sigmoid') cls.fit(X_val, np.array(y_val)) # predict preds = cls.predict_proba(X_test) if "calibrate" in cls_method:
# set optimal params for this bucket if bucket_method == "prefix": cls_args = {k:v for k,v in best_params[dataset_name][method_name][cls_method][bucket].items() if k not in ['n_clusters', 'n_neighbors']} else: cls_args = {k:v for k,v in best_params[dataset_name][method_name][cls_method].items() if k not in ['n_clusters', 'n_neighbors']} cls_args['random_state'] = random_state cls_args['min_cases_for_training'] = n_min_cases_in_bucket # select relevant cases relevant_cases_bucket = dataset_manager.get_indexes(dt_train_prefixes)[bucket_assignments_train == bucket] dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(dt_train_prefixes, relevant_cases_bucket) # one row per event train_y = dataset_manager.get_label_numeric(dt_train_bucket) feature_combiner = FeatureUnion([(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods]) pipelines[bucket] = Pipeline([('encoder', feature_combiner), ('cls', ClassifierFactory.get_classifier(cls_method, **cls_args))]) pipelines[bucket].fit(dt_train_bucket, train_y) prefix_lengths_test = dt_test_prefixes.groupby(dataset_manager.case_id_col).size() # test separately for each prefix length for nr_events in range(min_prefix_length, max_prefix_length+1): print("Predicting for %s events..."%nr_events) # select only cases that are at least of length nr_events relevant_cases_nr_events = prefix_lengths_test[prefix_lengths_test == nr_events].index if len(relevant_cases_nr_events) == 0:
def create_and_evaluate_model(args):
    """Hyperopt objective for the text-model + sequence-encoding pipeline.

    Splits the sampled hyperparameters into classifier args and text-model
    args, fits a text transformer and replaces each raw text column with its
    encoded features, generates prefixes, encodes them with a FeatureUnion of
    sequence encoders and averages ROC-AUC over the cross-validation folds.

    Relies on module-level state: ``trial_nr``, ``all_results``, ``n_iter``,
    ``n_splits``, ``folds_dir``, ``cls_params``, ``text_method``,
    ``dataset_manager``, ``methods``, ``cls_encoding``, ``text_enc``,
    ``nr_events``, ``min_prefix_length``, ``max_prefix_length``,
    ``cls_method``, ``random_state``, ``min_cases_for_training`` and
    ``class_ratios``.

    :param args: hyperparameter dict sampled by hyperopt.
    :return: dict with the negated mean AUC as 'loss', 'status' and the
             classifier from the last fold as 'model'.
    """
    global trial_nr, all_results
    trial_nr += 1
    print("Trial %s out of %s" % (trial_nr, n_iter))

    score = 0
    # split the sampled parameters between the classifier and the text model
    cls_args = {k: v for k, v in args.items() if k in cls_params}
    text_transformer_args = {k: v for k, v in args.items() if k not in cls_params}
    cls_args['n_estimators'] = 500

    for cv_iter in range(n_splits):
        # read encoded data
        train_chunk = dataset_manager.read_fold(os.path.join(folds_dir, "fold%s_train.csv" % cv_iter))
        test_chunk = dataset_manager.read_fold(os.path.join(folds_dir, "fold%s_test.csv" % cv_iter))

        # dataset/method specific defaults for the text model
        if text_method in ["nb", "bong"]:
            if dataset_ref in ["crm2", "github"] and cls_method == "xgboost" and "single" in bucket_enc:
                if "index" in bucket_enc:
                    text_transformer_args["nr_selected"] = 100
                    cls_args['n_estimators'] = 200
                else:
                    text_transformer_args["nr_selected"] = 200
            else:
                text_transformer_args["nr_selected"] = 500
            if 'ngram_max' not in text_transformer_args:
                text_transformer_args['ngram_max'] = 1
            if text_method == "nb":
                text_transformer_args["pos_label"] = dataset_manager.pos_label
        elif text_method in ["pv", "lda"]:
            text_transformer_args["random_seed"] = 22
            # both large datasets use the same frequency cutoff
            if dataset_name in ["github", "crm2"]:
                text_transformer_args["min_freq"] = 20

        # fit text model and transform for each text column
        text_transformer = EncoderFactory.get_encoder(text_method, text_transformer_args=text_transformer_args)
        # fit on all text columns at once; the per-column transforms below
        # reuse the fitted transformer, so the returned frame is not needed
        text_transformer.fit_transform(
            train_chunk[dataset_manager.static_text_cols + dataset_manager.dynamic_text_cols],
            train_chunk[dataset_manager.label_col])

        static_text_cols = []
        dynamic_text_cols = []
        for col in dataset_manager.static_text_cols + dataset_manager.dynamic_text_cols:
            dt_train_text = text_transformer.transform(train_chunk[[col]], train_chunk[dataset_manager.label_col])
            current_text_cols = ["%s_%s" % (col, text_col) for text_col in dt_train_text.columns]
            dt_train_text.columns = current_text_cols
            dt_test_text = text_transformer.transform(test_chunk[[col]])
            dt_test_text.columns = current_text_cols

            # replace the raw text column with its encoded features
            train_chunk = pd.concat([train_chunk.drop(col, axis=1), dt_train_text], axis=1, sort=False)
            test_chunk = pd.concat([test_chunk.drop(col, axis=1), dt_test_text], axis=1, sort=False)
            if col in dataset_manager.static_text_cols:
                static_text_cols.extend(current_text_cols)
            else:
                dynamic_text_cols.extend(current_text_cols)
            del dt_train_text, dt_test_text

        # generate prefixes
        if nr_events is not None:
            dt_train_prefixes = dataset_manager.generate_prefix_data(train_chunk, nr_events, nr_events)
            dt_test_prefixes = dataset_manager.generate_prefix_data(test_chunk, nr_events, nr_events)
        else:
            dt_train_prefixes = dataset_manager.generate_prefix_data(train_chunk, min_prefix_length, max_prefix_length)
            dt_test_prefixes = dataset_manager.generate_prefix_data(test_chunk, min_prefix_length, max_prefix_length)

        train_y = dataset_manager.get_label_numeric(dt_train_prefixes)
        test_y = dataset_manager.get_label_numeric(dt_test_prefixes)

        # set up sequence encoders
        encoders = []
        for method in methods:
            if cls_encoding == text_enc:
                # the text features ride along as dynamic numeric attributes
                cls_encoder_args = {'case_id_col': dataset_manager.case_id_col,
                                    'static_cat_cols': dataset_manager.static_cat_cols,
                                    'static_num_cols': dataset_manager.static_num_cols + static_text_cols,
                                    'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                                    'dynamic_num_cols': dataset_manager.dynamic_num_cols + dynamic_text_cols,
                                    'fillna': True}
            else:
                cls_encoder_args = {'case_id_col': dataset_manager.case_id_col,
                                    'static_cat_cols': dataset_manager.static_cat_cols,
                                    'static_num_cols': dataset_manager.static_num_cols + static_text_cols,
                                    'dynamic_cat_cols': dataset_manager.dynamic_cat_cols,
                                    'dynamic_num_cols': dataset_manager.dynamic_num_cols,
                                    'fillna': True}
            encoders.append((method, EncoderFactory.get_encoder(method, **cls_encoder_args)))
        if cls_encoding != text_enc and text_enc not in methods:
            # dedicated encoder for the dynamic text features
            cls_encoder_args = {'case_id_col': dataset_manager.case_id_col,
                                'static_cat_cols': [],
                                'static_num_cols': [],
                                'dynamic_cat_cols': [],
                                'dynamic_num_cols': dynamic_text_cols,
                                'fillna': True}
            encoders.append((text_enc, EncoderFactory.get_encoder(text_enc, **cls_encoder_args)))
        feature_combiner = FeatureUnion(encoders)

        # fit classifier and predict
        cls = ClassifierFactory.get_classifier(cls_method, cls_args, random_state,
                                               min_cases_for_training, class_ratios[cv_iter])
        if cls_method == "svm" or cls_method == "logit":
            # distance-based models need standardized features
            pipeline = Pipeline([('encoder', feature_combiner), ('scaler', StandardScaler()), ('cls', cls)])
        else:
            pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)])
        pipeline.fit(dt_train_prefixes, train_y)
        preds = pipeline.predict_proba(dt_test_prefixes)
        # AUC is undefined when only one class is present in the fold
        if len(set(test_y)) >= 2:
            score += roc_auc_score(test_y, preds)

    # save current trial results
    for k, v in cls_args.items():
        all_results.append((trial_nr, k, v, -1, score / n_splits))
    for k, v in text_transformer_args.items():
        all_results.append((trial_nr, k, v, -1, score / n_splits))

    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
train_y = dataset_manager.get_label_numeric(dt_train_bucket) # extract data about prefixes in this bucket (class labels and prefix lengths) test_y = dataset_manager.get_label_numeric(dt_test_bucket) test_nr_events = list( dataset_manager.get_prefix_lengths(dt_test_bucket)) test_case_ids = list( dt_test_bucket.groupby(dataset_manager.case_id_col).first().index) # initialize pipeline for sequence encoder and classifier feature_combiner = FeatureUnion([ (method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods ]) cls = ClassifierFactory.get_classifier(cls_method, params, None, min_cases_for_training, overall_class_ratio) pipeline = Pipeline([('encoder', feature_combiner), ('cls', cls)]) # fit pipeline pipeline.fit(dt_train_bucket, train_y) # predict preds = pipeline.predict_proba(dt_test_bucket) dt_all_predictions = pd.concat([ dt_all_predictions, pd.DataFrame({ "predicted": preds, "actual": test_y, "case_id": test_case_ids,
print("Fitting pipeline for bucket %s..." % bucket) relevant_cases_bucket = dataset_manager.get_indexes( dt_train_prefixes)[bucket_assignments_train == bucket] dt_train_bucket = dataset_manager.get_relevant_data_by_indexes( dt_train_prefixes, relevant_cases_bucket) # one row per event train_y = dataset_manager.get_label(dt_train_bucket, label_col=label_col) feature_combiner = FeatureUnion([ (method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods ]) pipelines[bucket] = Pipeline([ ('encoder', feature_combiner), ('cls', ClassifierFactory.get_classifier(cls_method, **params)) ]) pipelines[bucket].fit(dt_train_bucket, train_y) # if the bucketing is prefix-length-based, then evaluate for each prefix length separately, otherwise evaluate all prefixes together max_evaluation_prefix_length = max_prefix_length if bucket_method == "prefix" else min_prefix_length prefix_lengths_test = dt_test_prefixes.groupby( dataset_manager.case_id_col).size() # test separately for each prefix length for nr_events in range(min_prefix_length, max_evaluation_prefix_length + 1): print("Predicting for %s events..." % nr_events)
def create_and_evaluate_model(args):
    """Hyperopt objective balancing accuracy (AUC) against prediction stability.

    Trains the classifier ``n_runs`` times per fold (with ``random_state=None``
    so runs differ), measures the mean AUC and the mean squared prediction
    difference (MSPD) between run pairs, and combines them as
    ``alpha * AUC - beta * sqrt(MSPD)``.

    Relies on module-level state: ``trial_nr``, ``all_results``, ``n_runs``,
    ``alpha``, ``beta``, ``n_iter``, ``n_splits``, ``folds_dir``,
    ``cls_method``, ``min_cases_for_training`` and ``class_ratios``.

    Note: mutates the caller-supplied ``args`` dict (forces n_estimators=500).

    :param args: hyperparameter dict sampled by hyperopt.
    :return: dict with the negated combined score as 'loss' and 'status'.
    """
    global trial_nr, all_results, n_runs, alpha, beta
    trial_nr += 1
    print("Trial %s out of %s" % (trial_nr, n_iter))
    args['n_estimators'] = 500

    score_auc = 0
    preds_all = pd.DataFrame()
    for cv_iter in range(n_splits):
        # read encoded data
        dt_train = pd.read_csv(os.path.join(folds_dir, "fold%s_train.csv" % cv_iter), sep=";")
        dt_test = pd.read_csv(os.path.join(folds_dir, "fold%s_test.csv" % cv_iter), sep=";")
        dt_test = dt_test.fillna(0)
        with open(os.path.join(folds_dir, "fold%s_train_y.csv" % cv_iter), "rb") as fin:
            train_y = np.array(pickle.load(fin))
        with open(os.path.join(folds_dir, "fold%s_test_y.csv" % cv_iter), "rb") as fin:
            test_y = np.array(pickle.load(fin))

        for current_run in range(n_runs):
            # fit classifier and predict; random_state=None so each run varies
            cls = ClassifierFactory.get_classifier(cls_method, args, None,
                                                   min_cases_for_training, class_ratios[cv_iter])
            cls.fit(dt_train, train_y)
            preds = cls.predict_proba(dt_test)
            # NOTE(review): 'idx' restarts at 0 in every fold, so with
            # n_splits > 1 the MSPD merge below pairs rows across folds by
            # position — confirm this is intended.
            preds_all = pd.concat([preds_all,
                                   pd.DataFrame({'predicted': preds,
                                                 'run': current_run,
                                                 'idx': range(len(preds))})],
                                  axis=0, sort=False)
            score_auc += roc_auc_score(test_y, preds)

    score_auc = score_auc / n_splits / n_runs

    # mean squared prediction difference, averaged over all run pairs
    mspd_acc = 0
    pair_weight = 2.0 / (n_runs * (n_runs - 1))
    for i in range(n_runs):
        tmp1 = preds_all[preds_all.run == i]
        for j in range(i):
            tmp2 = preds_all[preds_all.run == j]
            tmp_merged = tmp1.merge(tmp2, on=["idx"])
            mspd_acc += pair_weight * np.mean(
                np.power(tmp_merged.predicted_x - tmp_merged.predicted_y, 2))

    score = alpha * score_auc - beta * np.sqrt(mspd_acc)

    # save current trial results
    for k, v in args.items():
        all_results.append(
            (trial_nr, k, v, -1, score_auc, np.sqrt(mspd_acc), score))

    return {'loss': -score, 'status': STATUS_OK}
def create_and_evaluate_model(args):
    """Hyperopt objective: cross-validated ROC-AUC, with optional wavelet features.

    For wavelet-based encodings the fold data is built by combining wavelet
    features with last-state features (``add_features``); otherwise the
    pre-encoded fold files are read directly. The fit/predict logic is shared
    between the two cases.

    Relies on module-level state: ``trial_nr``, ``all_results``, ``n_iter``,
    ``n_splits``, ``folds_dir``, ``cls_encoding``, ``cls_method``,
    ``random_state``, ``min_cases_for_training`` and ``class_ratios``.

    :param args: hyperparameter dict sampled by hyperopt.
    :return: dict with the negated mean AUC as 'loss', 'status' and the
             classifier from the last fold as 'model'.
    """
    global trial_nr, all_results
    trial_nr += 1
    print("Trial %s out of %s" % (trial_nr, n_iter))

    is_wavelet = cls_encoding in ("waveletLast", "waveletAgg", "waveletIndex")
    score = 0
    for cv_iter in range(n_splits):
        # read encoded data
        if is_wavelet:
            dt_train_last = pd.read_csv(os.path.join(folds_dir, "fold%s_train_last.csv" % cv_iter), sep=";")
            dt_test_last = pd.read_csv(os.path.join(folds_dir, "fold%s_test_last.csv" % cv_iter), sep=";")
            dt_train_wavelet = pd.read_csv(os.path.join(folds_dir, "fold%s_train_wavelet.csv" % cv_iter), sep=";")
            dt_test_wavelet = pd.read_csv(os.path.join(folds_dir, "fold%s_test_wavelet.csv" % cv_iter), sep=";")
            # combine wavelet features with last-state features and rename
            # the columns to plain integers so both frames line up
            dt_train = add_features(dt_train_wavelet, dt_train_last, numberOfFeatures=10)
            dt_train.columns = list(range(dt_train.shape[1]))
            dt_test = add_features(dt_test_wavelet, dt_test_last, numberOfFeatures=10)
            dt_test.columns = list(range(dt_test.shape[1]))
        else:
            dt_train = pd.read_csv(os.path.join(folds_dir, "fold%s_train.csv" % cv_iter), sep=";")
            dt_test = pd.read_csv(os.path.join(folds_dir, "fold%s_test.csv" % cv_iter), sep=";")

        with open(os.path.join(folds_dir, "fold%s_train_y.csv" % cv_iter), "rb") as fin:
            train_y = np.array(pickle.load(fin))
        with open(os.path.join(folds_dir, "fold%s_test_y.csv" % cv_iter), "rb") as fin:
            test_y = np.array(pickle.load(fin))

        # fit classifier and predict
        cls = ClassifierFactory.get_classifier(cls_method, args, random_state,
                                               min_cases_for_training, class_ratios[cv_iter])
        if cls_method == 'catboost':
            if is_wavelet:
                # persist the inferred column types for offline inspection
                with open('outfile' + '_' + cls_method + '_' + cls_encoding, 'wb') as fp:
                    pickle.dump(get_types(dt_train), fp)
            # catboost needs the categorical feature names passed explicitly
            cls.fit(dt_train, train_y,
                    list(dt_train.select_dtypes(include=['object', 'category']).columns))
        else:
            if cls_method in ('svm', 'logit', 'rf'):
                # these models cannot handle inf/NaN in the encoded features
                for df in (dt_train, dt_test):
                    df.replace([np.inf, -np.inf], np.nan, inplace=True)
                    df.fillna(0, inplace=True)
            cls.fit(dt_train, train_y)

        preds = cls.predict_proba(dt_test)
        # AUC is undefined when only one class is present in the fold
        if len(set(test_y)) >= 2:
            score += roc_auc_score(test_y, preds)

    # save current trial results
    for k, v in args.items():
        all_results.append((trial_nr, k, v, -1, score / n_splits))

    return {'loss': -score / n_splits, 'status': STATUS_OK, 'model': cls}
cls_args = best_params[key_id][method_name][cls_method] cls_args['mode'] = mode cls_args['random_state'] = random_state cls_args['min_cases_for_training'] = n_min_cases_in_bucket #print("Cls params are: %s" % str(list(cls_args.values()))) # select relevant cases relevant_cases_bucket = dataset_manager.get_indexes(dt_train_prefixes)[bucket_assignments_train == bucket] dt_train_bucket = dataset_manager.get_relevant_data_by_indexes(dt_train_prefixes, relevant_cases_bucket) # one row per event train_y = dataset_manager.get_label(dt_train_bucket, label_col=label_col, mode=mode) feature_combiner = FeatureUnion( [(method, EncoderFactory.get_encoder(method, **cls_encoder_args)) for method in methods]) pipelines[bucket] = Pipeline( [('encoder', feature_combiner), ('cls', ClassifierFactory.get_classifier(cls_method, **cls_args))]) pipelines[bucket].fit(dt_train_bucket, train_y) # feature_set = [] if self.hardcoded_prediction is not None: # for feature_set_this_encoding in pipelines[bucket].steps[0][1].transformer_list: # for feature in feature_set_this_encoding[1].columns.tolist(): # feature_set.append(feature) # # feats = {} # a dict to hold feature_name: feature_importance # for feature, importance in zip(feature_set, pipelines[bucket].named_steps.cls.cls.feature_importances_): # feats[feature] = importance # add the name/value pair # # importances = pd.DataFrame.from_dict(feats, orient='index').rename(columns={0: 'Gini-importance'}) # importances = importances.sort_values(by='Gini-importance', ascending=False) # importances.to_csv(os.path.join(home_dir, feature_importance_dir, "feat_importance_%s_%s_%s_%s_%s.csv" %