for i_split, (train_set, _) in enumerate(rd_split.split(range(len(rts)))):
    print("Process split: %d/%d." % (i_split + 1, rd_split.get_n_splits()))

    n_spectra.append(len(train_set))

    # Shuffle split does not preserve the order of the examples when sub-setting.
    train_set = np.sort(train_set)

    rts_train = rts[train_set]
    wtx_train = wtx[train_set]

    pairs, _ = get_pairs_single_system(rts_train, d_lower=0, d_upper=np.inf,
                                       return_rt_differences=True)

    # Calculate the pairwise accuracy
    score = 0.0
    for i, j in pairs:
        if wtx_train[i] < wtx_train[j]:
            score += 1.0
    if len(pairs) > 0:
        score /= len(pairs)

    print("Kendall tau=%f, Spearmanr=%f, pairwise acc=%f"
          % (sp.stats.kendalltau(wtx_train, rts_train)[0],
             sp.stats.spearmanr(wtx_train, rts_train)[0],
             score))

# 3) Perform the reranking of the candidates
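
# Illustrative sketch (not part of the original script): with d_lower=0, d_upper=np.inf
# and no ties in the retention times, the pairwise accuracy computed above equals
# (tau + 1) / 2, where tau is Kendall's tau between the predicted scores and the
# retention times. All *_demo names are hypothetical and only serve this example.
import itertools

import numpy as np
import scipy as sp
import scipy.stats

rts_demo = np.array([2.0, 4.0, 6.0, 8.0, 10.0])  # toy retention times
wtx_demo = np.array([0.1, 0.5, 0.4, 0.9, 1.2])   # toy predicted order scores

# All (i, j) pairs with rts_demo[i] < rts_demo[j], i.e. i elutes before j.
pairs_demo = [(i, j) for i, j in itertools.combinations(range(len(rts_demo)), 2)
              if rts_demo[i] < rts_demo[j]]
acc_demo = np.mean([wtx_demo[i] < wtx_demo[j] for i, j in pairs_demo])
tau_demo = sp.stats.kendalltau(wtx_demo, rts_demo)[0]
assert np.isclose(acc_demo, (tau_demo + 1) / 2)
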

def test_equal_to_simple_function_in_single_system_case(self):
    cretention = retention_cls()

    # ----------------------------------------------
    d_target = OrderedDict([(("M1", "A"), 10), (("M2", "A"), 4), (("M3", "A"), 6),
                            (("M4", "A"), 8), (("M5", "A"), 2)])
    keys = list(d_target.keys())

    cretention.load_data_from_target(d_target)
    cretention.make_digraph()
    cretention.dmolecules_inv = cretention.invert_dictionary(cretention.dmolecules)
    cretention.dcollections_inv = cretention.invert_dictionary(cretention.dcollections)

    d_pairs_ref = {
        0: [],
        1: [(4, 1), (1, 2), (2, 3), (3, 0)],
        2: [(4, 1), (1, 2), (2, 3), (3, 0), (4, 2), (1, 3), (2, 0)],
        3: [(4, 1), (1, 2), (2, 3), (3, 0), (4, 2), (1, 3), (2, 0), (4, 3), (1, 0)],
        4: [(4, 1), (1, 2), (2, 3), (3, 0), (4, 2), (1, 3), (2, 0), (4, 3), (1, 0), (4, 0)]
    }

    for d in d_pairs_ref.keys():
        pairs_og = get_pairs_from_order_graph(cretention, keys, allow_overlap=True,
                                              d_lower=0, d_upper=d)
        pairs = get_pairs_single_system(list(d_target.values()), d_lower=0, d_upper=d)

        self.assertEqual(len(pairs_og), len(d_pairs_ref[d]))
        self.assertEqual(len(pairs), len(d_pairs_ref[d]))

        for pair in d_pairs_ref[d]:
            self.assertIn(pair, pairs_og)
            self.assertIn(pair, pairs)

    # ----------------------------------------------
    d_target = OrderedDict([(("M1", "A"), 10), (("M2", "A"), 4), (("M3", "A"), 6),
                            (("M4", "A"), 8), (("M5", "A"), 2)])
    keys = list(d_target.keys())

    cretention.load_data_from_target(d_target)
    cretention.make_digraph()
    cretention.dmolecules_inv = cretention.invert_dictionary(cretention.dmolecules)
    cretention.dcollections_inv = cretention.invert_dictionary(cretention.dcollections)

    d_pairs_ref = {
        5: [],
        4: [(4, 0)],
        3: [(4, 0), (4, 3), (1, 0)],
        2: [(4, 0), (4, 3), (1, 0), (4, 2), (1, 3), (2, 0)],
        1: [(4, 0), (4, 3), (1, 0), (4, 2), (1, 3), (2, 0), (4, 1), (1, 2), (2, 3), (3, 0)],
        0: [(4, 0), (4, 3), (1, 0), (4, 2), (1, 3), (2, 0), (4, 1), (1, 2), (2, 3), (3, 0)]
    }

    for d in d_pairs_ref.keys():
        pairs_og = get_pairs_from_order_graph(cretention, keys, allow_overlap=True,
                                              d_lower=d, d_upper=np.inf)
        pairs = get_pairs_single_system(list(d_target.values()), d_lower=d,
                                        d_upper=np.inf)

        self.assertEqual(len(pairs_og), len(d_pairs_ref[d]))
        self.assertEqual(len(pairs), len(d_pairs_ref[d]))

        for pair in d_pairs_ref[d]:
            self.assertIn(pair, pairs_og)
            self.assertIn(pair, pairs)
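
# Reading aid (interpretation sketch, not part of the test suite): a pair (i, j)
# states that example i elutes before example j, and d_lower / d_upper bound the
# rank distance between the paired examples. For the retention times [10, 4, 6, 8, 2]
# the elution order of the indices is 4, 1, 2, 3, 0, so, matching d_pairs_ref above:
#
#   get_pairs_single_system([10, 4, 6, 8, 2], d_lower=0, d_upper=1)
#       -> the four neighbouring pairs (4, 1), (1, 2), (2, 3), (3, 0)
#   get_pairs_single_system([10, 4, 6, 8, 2], d_lower=4, d_upper=np.inf)
#       -> only the most distant pair (4, 0)
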

def evaluate_on_target_systems(target_systems, training_systems, predictor, pair_params,
                               kernel_params, opt_params, input_dir, estimator,
                               feature_type, n_jobs=1, perc_for_training=100):
    """
    Task: Evaluate rank-correlation, accuracy, etc. by learning an order predictor
          using the given set of training systems and predicting on the given set
          of target systems.

          For the evaluation we use either a repeated random split of the target
          system's data (if fewer than 75 examples are available for testing) or a
          cross-validation (otherwise). The hyper-parameters of the order predictor
          are optimized using a nested cross-validation. The routines for that can
          be found in the file 'model_selection_cls.py'.

          If desired (excl_mol_by_struct_only == True), the molecular structures of
          the test set are removed from the training set based on their molecular
          structure, e.g. by comparison of their InChIs, _even_ if these structures
          have been measured with a system other than the target system, i.e.,
          another chromatographic system.

          See also the paper for details on the evaluation strategy.

    :param target_systems: list of strings, containing the target systems

    :param training_systems: list of strings, containing the training systems

    :param predictor: list of strings, containing the predictors / molecular
        features used for the model construction.

    :param pair_params: dictionary, containing the parameters used for the creation
        of the RankSVM learning pairs, e.g. minimum and maximum order distance.

    :param kernel_params: dictionary, containing the parameters for the kernels and
        generally for handling the input features / predictors. See the definition
        of the dictionary in the __main__ of the file 'evaluation_scenario_cls.py'.

    :param opt_params: dictionary, containing the parameters controlling the
        hyper-parameter optimization, number of cross-validation splits, etc. See
        the definition of the dictionary in the __main__ of the file
        'evaluation_scenario_cls.py'.

    :param input_dir: string, directory containing the input data, e.g.,
        fingerprints and retention times.

    :param estimator: string, order predictor to use: either "ranksvm" or "svr".

    :param feature_type: string, feature type that is used for the RankSVM.
        Currently only 'difference' features are supported, i.e., \phi_j - \phi_i
        is used for the decision. If the estimator is not RankSVM, but e.g. Support
        Vector Regression, then this parameter can be set to None and is ignored.

    :param n_jobs: integer, number of jobs used for the hyper-parameter estimation.
        The maximum number of jobs used is the number of inner splits
        (cross-validation or random split)!

    :param perc_for_training: scalar, percentage of the target system's data that
        is used for the training, e.g., selected by simple random sub-sampling.
        This value only affects the training process if the target system is in the
        set of training systems.

    :return: tuple of pandas.DataFrame

        1) mapped_values: predicted order scores for each target system
            - corresponds to: w^T \phi_i in the RankSVM case
            - corresponds to: the predicted retention time in the SVR case
        2) correlations: rank correlations of the order scores for each target system
        3) accuracies: pairwise prediction accuracies for each target system
        4) simple_statistics: number of training and test examples, etc.
        5) grid_search_results: hyper-parameter scores for the different grid-parameters
        6) grid_search_best_params: hyper-parameter scores for the best grid-parameters

    NOTE: The returned results (except mapped_values and the grid search results)
          are averages across the different random splits / cross-validation folds
          and repetitions.
    """
    # Variables related to the number of random / cv splits, for the outer
    # (*_cv) and the inner folds (*_ncv).
    n_splits_shuffle = opt_params["n_splits_shuffle"]
    n_splits_nshuffle = opt_params["n_splits_nshuffle"]
    n_splits_cv = opt_params["n_splits_cv"]
    n_splits_ncv = opt_params["n_splits_ncv"]
    n_rep = opt_params["n_rep"]

    # Should molecules be excluded from the training, if their structure appears
    # in the test set, _even if_ they have been measured with another system than
    # the (current) target system:
    excl_mol_by_struct_only = opt_params["excl_mol_by_struct_only"]

    # Currently only 'slack_type == "on_pairs"' is supported.
    slack_type = opt_params["slack_type"]
    if slack_type != "on_pairs":
        raise ValueError("Invalid slack type: %s" % slack_type)

    # Should all possible pairs be used for the (inner) test split during the
    # parameter estimation, regardless of the settings for 'd_upper' and 'd_lower'?
    all_pairs_for_test = opt_params["all_pairs_for_test"]

    if estimator not in ["ranksvm", "svr"]:
        raise ValueError("Invalid estimator: %s" % estimator)

    # RankSVM and SVR regularization parameter
    param_grid = {"C": opt_params["C"]}

    if estimator == "svr":
        # error-tube width of the SVR
        param_grid["epsilon"] = opt_params["epsilon"]

    # Molecule kernel
    if kernel_params["kernel"] == "linear":
        kernel = "linear"
    elif kernel_params["kernel"] in ["rbf", "gaussian"]:
        param_grid["gamma"] = kernel_params["gamma"]
        kernel = "rbf"
    elif kernel_params["kernel"] == "tanimoto":
        if estimator in ["ranksvm"]:
            kernel = tanimoto_kernel
        elif estimator in ["svr"]:
            kernel = tanimoto_kernel_mat
    elif kernel_params["kernel"] == "minmax":
        if estimator in ["ranksvm"]:
            kernel = minmax_kernel
        elif estimator in ["svr"]:
            kernel = minmax_kernel_mat
    else:
        raise ValueError("Invalid kernel: %s." % kernel_params["kernel"])

    if isinstance(target_systems, str):
        target_systems = [target_systems]
    if isinstance(training_systems, str):
        training_systems = [training_systems]
    all_systems = list(set(target_systems).union(training_systems))
    assert isinstance(target_systems, list) and isinstance(training_systems, list)

    n_target_systems = len(target_systems)
    n_training_systems = len(training_systems)

    print("Target systems (# = %d): %s" % (n_target_systems, ",".join(target_systems)))
    print("Training systems (# = %d): %s" % (n_training_systems, ",".join(training_systems)))

    ## Load the target and training systems into dictionaries using
    ## (molecule, system)-keys and retention times respectively molecular features
    ## as values.

    # If we use molecular descriptors, we need to scale the data, e.g. to [0, 1].
    if kernel_params["scaler"] == "noscaling":
        scaler = None
    elif kernel_params["scaler"] == "minmax":
        scaler = MinMaxScaler()
    elif kernel_params["scaler"] == "std":
        scaler = StandardScaler()
    elif kernel_params["scaler"] == "l2norm":
        scaler = Normalizer()
    else:
        raise ValueError("Invalid scaler for the molecular features: %s"
                         % kernel_params["scaler"])

    # Handle counting MACCS fingerprints
    if predictor[0] == "maccsCount_f2dcf0b3":
        predictor_c = ["maccs"]
        predictor_fn = "fps_maccs_count.csv"
    else:
        predictor_c = predictor
        predictor_fn = None

    d_rts, d_features, d_system_index = OrderedDict(), OrderedDict(), OrderedDict()

    for k_sys, system in enumerate(all_systems):
        rts, data = load_data(input_dir, system=system, predictor=predictor_c,
                              pred_fn=predictor_fn)

        # Use (mol-id, system)-tuples as keys
        keys = list(zip(rts.inchi.values, [system] * rts.shape[0]))

        # Values: retention time, features
        rts = rts.rt.values.reshape(-1, 1)
        data = data.drop("inchi", axis=1).values

        if kernel_params["poly_feature_exp"]:
            # If we use binary fingerprints, we can include some interactions,
            # e.g. x_1 * x_2, ...
            data = PolynomialFeatures(interaction_only=True,
                                      include_bias=False).fit_transform(data)

        # Make ordered dictionaries
        d_rts[system], d_features[system] = OrderedDict(), OrderedDict()

        for i, key in enumerate(keys):
            d_rts[system][key] = rts[i, 0]
            d_features[system][key] = data[i, :]

        # Dictionary containing a unique numeric identifier for each system
        d_system_index[system] = k_sys

        if scaler is not None:
            if getattr(scaler, "partial_fit", None) is not None:
                # 'partial_fit' allows us to learn the parameters of the scaler
                # online. (great stuff :))
                scaler.partial_fit(data)
            else:
                # We have a scaler at hand that does not allow online fitting. This
                # probably means that it is a scaler that performs the desired
                # scaling for each example independently, e.g.
                # sklearn.preprocessing.Normalizer.
                pass

    for system in target_systems:
        print("Target set '%s' contains %d examples." % (system, len(d_rts[system])))

    # Collect all the data that is available for training.
    d_rts_training = join_dicts(d_rts, training_systems)
    d_features_training = join_dicts(d_features, training_systems)

    # (mol-id, system)-tuples used in the training set
    l_keys_training = list(d_features_training.keys())

    # Data frames storing the evaluation measures
    mapped_values = {target_system: DataFrame() for target_system in target_systems}
    accuracies, correlations, simple_statistics = DataFrame(), DataFrame(), DataFrame()
    grid_search_results, grid_search_best_params = DataFrame(), DataFrame()

    for idx_system, target_system in enumerate(target_systems):
        print("Process target system: %s (%d/%d)."
              % (target_system, idx_system + 1, len(target_systems)))

        # (mol-id, system)-tuples in the target set
        l_keys_target = list(d_features[target_system].keys())

        for i_rep in range(n_rep):
            print("Repetition: %d/%d" % (i_rep + 1, n_rep))

            # Get a random subset of the training data
            l_keys_training_sub = sample_perc_from_list(l_keys_training,
                                                        tsystem=target_system,
                                                        perc=perc_for_training,
                                                        random_state=747 * i_rep)
            print("Training set contains %d (%f%%) examples."
                  % (len(l_keys_training_sub),
                     100 * len(l_keys_training_sub) / len(l_keys_training)))

            for training_system in training_systems:
                n_train_sys_sub = sum(np.array(list(zip(*l_keys_training_sub))[1])
                                      == training_system)
                n_train_sys = sum(np.array(list(zip(*l_keys_training))[1])
                                  == training_system)
                print("\tSystem %s contributes %d (%f%%) examples."
                      % (training_system, n_train_sys_sub,
                         100 * n_train_sys_sub / n_train_sys))

            # Check whether the target system has any overlap with the training systems
            print("Outer validation split strategy: ", end="", flush=True)
            l_molids_training = list(zip(*l_keys_training_sub))[0]
            l_molids_target = list(zip(*l_keys_target))[0]

            if (excl_mol_by_struct_only and
                (len(set(l_molids_training) & set(l_molids_target)) == 0)) or \
               (not excl_mol_by_struct_only and
                (len(set(l_keys_training_sub) & set(l_keys_target)) == 0)):
                print("Predefined split:\n"
                      "\tTraining and target do not share molecular structures "
                      "(excl_mol_by_struct_only=%d)" % excl_mol_by_struct_only)
                cv_outer = PredefinedSplit(np.zeros(len(l_keys_target)))
            else:
                # Determine the strategy for the training / test splits
                if len(l_keys_target) < 75:
                    print("ShuffleSplit")
                    train_size = 0.75
                    cv_outer = ShuffleSplit(n_splits=n_splits_shuffle,
                                            train_size=train_size,
                                            test_size=(1 - train_size),
                                            random_state=320 * i_rep)
                else:
                    print("KFold")
                    cv_outer = KFold(n_splits=n_splits_cv, shuffle=True,
                                     random_state=320 * i_rep)

            # Performance evaluation using cross-validation / random splits
            for i_fold, (_, test_set) in enumerate(cv_outer.split(l_keys_target)):
                print("Outer fold: %d/%d" % (i_fold + 1, cv_outer.get_n_splits()))

                # (mol-id, system)-tuples in the test subset of the target set
                l_keys_target_test = [l_keys_target[idx] for idx in test_set]

                # Remove the test subset of the target set from the training set.
                # NOTE: The training set might contain the whole target set.
                l_molids_target_test = list(zip(*l_keys_target_test))[0]

                if excl_mol_by_struct_only:
                    l_keys_training_train = [key for key in l_keys_training_sub
                                             if key[0] not in l_molids_target_test]
                else:
                    l_keys_training_train = [key for key in l_keys_training_sub
                                             if key not in l_keys_target_test]

                if isinstance(cv_outer, PredefinedSplit):
                    print("Shuffle pre-defined split.")
                    rs_old = np.random.get_state()
                    np.random.seed(320 * i_fold)
                    # If we use the pre-defined split, we need to shuffle ourselves.
                    # That way we prevent bias during the h-param estimation.
                    np.random.shuffle(l_keys_training_train)  # Shuffle is done in place
                    np.random.set_state(rs_old)

                l_molids_training_train = list(zip(*l_keys_training_train))[0]

                if excl_mol_by_struct_only:
                    assert (len(set(l_molids_target_test) &
                                set(l_molids_training_train)) == 0)
                else:
                    assert (len(set(l_keys_target_test) &
                                set(l_keys_training_train)) == 0)

                # Determine the strategy for the training / test splits (inner)
                print("Inner (h-param) validation split strategy: ", end="", flush=True)
                if len(l_keys_training_train) < 75:
                    print("GroupShuffleSplit")
                    train_size = 0.75
                    cv_inner = GroupShuffleSplit(n_splits=n_splits_nshuffle,
                                                 train_size=train_size,
                                                 test_size=(1 - train_size),
                                                 random_state=350 * i_fold * i_rep)
                else:
                    print("GroupKFold")
                    cv_inner = GroupKFold(n_splits=n_splits_ncv)

                # Train the RankSVM: Find the optimal set of hyper-parameters
                od_rts_training_train, od_features_training_train = OrderedDict(), OrderedDict()
                for key in l_keys_training_train:
                    od_rts_training_train[key] = d_rts_training[key]
                    od_features_training_train[key] = d_features_training[key]

                start_time = time.time()

                if estimator == "ranksvm":
                    best_params, cv_results, n_train_pairs, ranking_model, _, _ = find_hparan_ranksvm(
                        estimator=KernelRankSVC(kernel=kernel, slack_type=slack_type,
                                                random_state=319 * i_fold * i_rep),
                        fold_score_aggregation="weighted_average",
                        X=od_features_training_train, y=od_rts_training_train,
                        param_grid=param_grid, cv=cv_inner, pair_params=pair_params,
                        n_jobs=n_jobs, scaler=scaler,
                        all_pairs_as_test=all_pairs_for_test)
                elif estimator == "svr":
                    best_params, cv_results, n_train_pairs, ranking_model = find_hparam_regression(
                        estimator=SVRPairwise(kernel=kernel),
                        X=od_features_training_train, y=od_rts_training_train,
                        param_grid=param_grid, cv=cv_inner, n_jobs=n_jobs,
                        scaler=scaler)
                else:
                    raise ValueError("Invalid estimator: %s" % estimator)

                rtime_gcv = time.time() - start_time
                print("[find_hparam_*] %.3fsec" % rtime_gcv)

                # Store the grid-search statistics for further analyses
                grid_search_results_tmp = DataFrame(cv_results)
                grid_search_results_tmp["target_system"] = target_system
                grid_search_results_tmp["training_systems"] = ";".join(training_systems)
                grid_search_results = grid_search_results.append(grid_search_results_tmp)

                grid_search_best_params_tmp = DataFrame([best_params])
                grid_search_best_params_tmp["target_system"] = target_system
                grid_search_best_params_tmp["training_systems"] = ";".join(training_systems)
                grid_search_best_params = grid_search_best_params.append(grid_search_best_params_tmp)

                print(grid_search_best_params_tmp)

                ## Do prediction for the test set
                # Calculate: w' * \phi(x_i), for all molecules i
                X_test, rts_test = [], []
                for key in l_keys_target_test:
                    rts_test.append(d_rts[target_system][key])
                    X_test.append(d_features[target_system][key])

                rts_test = np.array(rts_test).reshape(-1, 1)
                X_test = np.array(X_test)

                if scaler is not None:
                    X_test = scaler.transform(X_test)

                if estimator == "ranksvm":
                    Y_pred_test = ranking_model.predict(X_test, X_test)
                elif estimator == "svr":
                    Y_pred_test = ranking_model.predict(X_test)
                else:
                    raise ValueError("Invalid estimator: %s" % estimator)

                wTx = ranking_model.map_values(X_test)

                mapped_values[target_system] = pd.concat(
                    [mapped_values[target_system],
                     DataFrame({"mapped_value": wTx,
                                "true_rt": rts_test.flatten(),
                                "inchi": l_molids_target_test})],
                    ignore_index=True)

                correlations = correlations.append(
                    {"rank_corr": sp.stats.kendalltau(wTx, rts_test)[0],
                     "spear_corr": sp.stats.spearmanr(wTx, rts_test)[0],
                     "target_system": target_system,
                     "training_system": ";".join(training_systems)},
                    ignore_index=True)

                n_train_mol = len(set(l_molids_training_train))
                n_test_mol = len(set(l_molids_target_test))
                n_shared_mol = len(set(l_molids_target_test) &
                                   set(l_molids_training_train))
                p_shared_mol = float(n_shared_mol) / n_test_mol

                # Predict: x_i > x_j or x_i < x_j for all molecule pairs (i, j)
                with Timer("Get prediction score"):
                    for d_lower, d_upper in itertools.product(
                            [0] + list(range(1, 15, 2)),
                            2**np.array([0, 1, 2, 3, 4, 5, 6, np.inf])):
                        if d_lower > d_upper:
                            continue

                        pairs_test = get_pairs_single_system(rts_test, d_lower=d_lower,
                                                             d_upper=d_upper)

                        accuracies = accuracies.append(
                            {"score_w": ranking_model.score_using_prediction(
                                 Y_pred_test, pairs_test, normalize=False),
                             "score": ranking_model.score_using_prediction(
                                 Y_pred_test, pairs_test),
                             "n_pairs_test": len(pairs_test),
                             "target_system": target_system,
                             "training_system": ";".join(training_systems),
                             "d_lower": d_lower, "d_upper": d_upper,
                             "i_rep": i_rep},
                            ignore_index=True)

                        # Write out how many molecular structures are shared between
                        # the target and training systems
                        n_test_pairs = len(pairs_test)
                        simple_statistics = simple_statistics.append(
                            {"n_shared_mol": n_shared_mol,
                             "p_shared_mol": p_shared_mol,
                             "n_train_mol": n_train_mol,
                             "n_test_mol": n_test_mol,
                             "n_train_pairs": n_train_pairs,
                             "n_test_pairs": n_test_pairs,
                             "grid_search_time": rtime_gcv,
                             "target_system": target_system,
                             "training_systems": ";".join(training_systems),
                             "d_lower": d_lower, "d_upper": d_upper},
                            ignore_index=True)

    # Average the mapped values over the repetitions
    for target_system in target_systems:
        mapped_values[target_system]["mapped_value_std"] = \
            mapped_values[target_system]["mapped_value"]
        mapped_values[target_system] = mapped_values[target_system].groupby(
            ["inchi"], as_index=False).agg({"mapped_value": np.mean,
                                            "mapped_value_std": np.std,
                                            "true_rt": np.unique})

    # Aggregate the rows in 'correlations' to get the mean and std values across the folds.
    correlations["rank_corr_std"] = correlations["rank_corr"]
    correlations["spear_corr_std"] = correlations["spear_corr"]
    correlations = correlations.groupby(
        ["target_system", "training_system"],
        as_index=False).agg({"rank_corr": np.mean, "rank_corr_std": np.std,
                             "spear_corr": np.mean, "spear_corr_std": np.std})

    # Aggregate the rows in 'accuracies' to get the expected pairwise accuracy
    accuracies = accuracies.groupby(
        ["target_system", "training_system", "d_lower", "d_upper", "i_rep"],
        as_index=False).agg({"score_w": np.sum, "n_pairs_test": np.sum,
                             "score": np.mean})
    accuracies["score_w"] = accuracies["score_w"] / accuracies["n_pairs_test"]
    accuracies.drop(["i_rep", "n_pairs_test"], axis=1, inplace=True)

    # Calculate the expected accuracy across the repetitions
    accuracies["score_w_std"] = accuracies["score_w"]
    accuracies["score_std"] = accuracies["score"]
    accuracies = accuracies.groupby(
        ["target_system", "training_system", "d_lower", "d_upper"],
        as_index=False).agg({"score_w": np.mean, "score_w_std": np.std,
                             "score": np.mean, "score_std": np.std})

    # Aggregate the simple statistics
    simple_statistics["n_shared_mol_std"] = simple_statistics["n_shared_mol"]
    simple_statistics["p_shared_mol_std"] = simple_statistics["p_shared_mol"]
    simple_statistics["n_train_mol_std"] = simple_statistics["n_train_mol"]
    simple_statistics["n_test_mol_std"] = simple_statistics["n_test_mol"]
    simple_statistics["n_train_pairs_std"] = simple_statistics["n_train_pairs"]
    simple_statistics["n_test_pairs_std"] = simple_statistics["n_test_pairs"]
    simple_statistics["grid_search_time_std"] = simple_statistics["grid_search_time"]
    simple_statistics = simple_statistics.groupby(
        ["target_system", "training_systems", "d_lower", "d_upper"],
        as_index=False).agg({"n_shared_mol": np.mean, "p_shared_mol": np.mean,
                             "n_train_mol": np.mean, "n_test_mol": np.mean,
                             "n_train_pairs": np.mean, "n_test_pairs": np.mean,
                             "grid_search_time": np.mean,
                             "n_shared_mol_std": np.std, "p_shared_mol_std": np.std,
                             "n_train_mol_std": np.std, "n_test_mol_std": np.std,
                             "n_train_pairs_std": np.std, "n_test_pairs_std": np.std,
                             "grid_search_time_std": np.std})

    return mapped_values, correlations, accuracies, simple_statistics, \
        grid_search_results, grid_search_best_params
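
# Minimal usage sketch (hypothetical values; the real parameter dictionaries are
# defined in the __main__ of 'evaluation_scenario_cls.py'). The keys shown are the
# ones accessed above; the system name, input directory and concrete values are
# illustrative assumptions only.
if __name__ == "__main__":
    example_pair_params = {"allow_overlap": True, "d_lower": 0, "d_upper": 16,
                           "ireverse": True}
    example_kernel_params = {"kernel": "tanimoto", "scaler": "noscaling",
                             "poly_feature_exp": False}
    example_opt_params = {"C": [0.1, 1, 10], "n_splits_shuffle": 10,
                          "n_splits_nshuffle": 10, "n_splits_cv": 10,
                          "n_splits_ncv": 10, "n_rep": 3,
                          "excl_mol_by_struct_only": True,
                          "slack_type": "on_pairs", "all_pairs_for_test": True}

    mapped_values, correlations, accuracies, simple_statistics, \
        grid_search_results, grid_search_best_params = evaluate_on_target_systems(
            target_systems="SystemA", training_systems=["SystemA"],
            predictor=["maccs"], pair_params=example_pair_params,
            kernel_params=example_kernel_params, opt_params=example_opt_params,
            input_dir="./data", estimator="ranksvm", feature_type="difference",
            n_jobs=4, perc_for_training=100)
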

def single_dataset(X, target, kernel, convergence_criteria="alpha_change_max", t_0=0.5,
                   tol=0.001, slack_type="on_pairs", fig_fn=None, C=1,
                   step_size_algorithm="diminishing_2"):
    fig, axes = plt.subplots(2, 3)
    fig.suptitle(dict2str({"convergence_criteria": convergence_criteria,
                           "step_size_algorithm": step_size_algorithm,
                           "slack_type": slack_type}, sep=" ; "))
    if fig_fn is not None:
        fig.set_size_inches(14, 8)

    if X.shape[1] == 2:
        visualize_dataset2(X[:, 0], X[:, 1], target, axes[0, 0],
                           title="Dataset: %s" % type)
    else:
        # Do some low dimensional embedding, e.g. given a set of fingerprints.
        pass

    # Get training and test split
    train_set, test_set = list(ShuffleSplit(n_splits=1, train_size=0.75,
                                            test_size=0.25,
                                            random_state=666).split(X))[0]

    pairs_train = get_pairs_single_system(target[train_set], d_lower=0, d_upper=16)
    pairs_train_full = get_pairs_single_system(target[train_set], d_lower=0,
                                               d_upper=np.inf)
    pairs_test = get_pairs_single_system(target[test_set], d_lower=0, d_upper=np.inf)

    alpha = np.zeros((len(pairs_train), 1))
    t_0_ = t_0
    k = 1
    max_iter = 25
    n_steps = 80

    f_0s = np.array([])   # primal objective values during optimization
    gs = np.array([])     # dual objective values during optimization
    dgs = np.array([])    # duality gap
    rdgs = np.array([])
    score_train = []
    score_test = []
    rank_corr_train = []
    rank_corr_test = []
    x_space = []

    for ii in range(n_steps):
        print("%d: " % ii, end="", flush=True)

        ranksvm = KernelRankSVC(C=C, debug=1, kernel=kernel, slack_type=slack_type,
                                t_0=t_0_, convergence_criteria=convergence_criteria,
                                max_iter=k + (max_iter - 1),
                                step_size_algorithm=step_size_algorithm,
                                random_state=101, degree=2, gamma=0.5, tol=tol,
                                verbose=True)

        # Set stepsize to latest stepsize
        ranksvm.t_0 = t_0_

        ranksvm.fit(None, None, fit_params={"FX": X[train_set], "pairs": pairs_train,
                                            "alpha_init": alpha, "k_init": k})

        # Store the alpha as initial value for the next round
        alpha = ranksvm.alpha.reshape((-1, 1))

        # Get last stepsize
        print("Stepsize: t_0 = %f, t_conv = %f; Iteration: k = %d"
              % (t_0, ranksvm._t_convergence, ranksvm._k_convergence))
        if step_size_algorithm == "diminishing_2":
            t_0_ = ranksvm._get_step_size_diminishing_2(ranksvm._t_convergence)

        if ranksvm._obj_has_converged:
            break
        else:
            k = ranksvm._k_convergence + 1

        # Get internal optimization results
        f_0s = np.concatenate((f_0s, np.array(ranksvm.f_0s).flatten()))
        gs = np.concatenate((gs, np.array(ranksvm.gs).flatten()))
        dgs = np.concatenate((dgs, np.array(ranksvm.dgs).flatten()))
        rdgs = np.concatenate((rdgs, np.array(ranksvm.rdgs).flatten()))

        # Get pairwise score and rank correlation (train set)
        score_train.append(ranksvm.score(X[train_set], pairs_train_full))
        rank_corr_train.append(sp.stats.kendalltau(ranksvm.map_values(X[train_set]),
                                                   target[train_set])[0])

        # Get pairwise score and rank correlation (test set)
        score_test.append(ranksvm.score(X[test_set], pairs_test))
        rank_corr_test.append(sp.stats.kendalltau(ranksvm.map_values(X[test_set]),
                                                  target[test_set])[0])

        x_space.append(ranksvm._k_convergence)

    print("Iterations: %d" % ranksvm._k_convergence)

    # Run RankSVM optimization until convergence
    ranksvm = KernelRankSVC(C=C, verbose=True, kernel=kernel, slack_type=slack_type,
                            t_0=t_0, convergence_criteria=convergence_criteria,
                            step_size_algorithm=step_size_algorithm, degree=2,
                            gamma=0.5, tol=tol, max_iter=max_iter * n_steps,
                            random_state=101)
    ranksvm.fit(None, None, fit_params={"FX": X[train_set], "pairs": pairs_train})

    visualize_ranksvm(target[test_set], ranksvm.map_values(X[test_set]), axes[0, 1],
                      title="Iter: %d (max = %d)"
                            % (ranksvm._k_convergence, ranksvm.max_iter))

    rc_train_line = axes[0, 2].plot(x_space, rank_corr_train, color="blue", linestyle="-")
    rc_test_line = axes[0, 2].plot(x_space, rank_corr_test, color="red", linestyle="-")
    axes[0, 2].set_xlabel("Iteration")
    axes[0, 2].set_ylabel("Rank-correlation")
    axes[0, 2].set_title("C = %.3f" % ranksvm.C)
    axes[0, 2].grid(True)
    axes[0, 2].legend((rc_train_line[0], rc_test_line[0]), ("Train", "Test"))

    sc_train_line = axes[1, 0].plot(x_space, score_train, color="blue", linestyle="-")
    sc_test_line = axes[1, 0].plot(x_space, score_test, color="red", linestyle="-")
    axes[1, 0].set_xlabel("Iteration")
    axes[1, 0].set_ylabel("Pairwise accuracy")
    axes[1, 0].set_title("C = %.3f" % ranksvm.C)
    axes[1, 0].grid(True)
    axes[1, 0].legend((sc_train_line[0], sc_test_line[0]), ("Train", "Test"))

    # Primal and dual objective
    f_0_line = axes[1, 1].semilogy(f_0s, color="blue", linestyle="-")
    g_line = axes[1, 1].semilogy(gs, color="red", linestyle="-")
    axes[1, 1].grid(True)
    axes[1, 1].set_xlabel("Iteration")
    axes[1, 1].set_ylabel("Objective value")
    axes[1, 1].legend((f_0_line[0], g_line[0]), ("Primal", "Dual"), title="Objectives")

    # Duality gap
    dgs_line = axes[1, 2].semilogy(dgs, "green", linestyle="-")
    rdgs_line = axes[1, 2].semilogy(rdgs, "red", linestyle="-")
    axes[1, 2].set_xlabel("Iteration")
    axes[1, 2].set_ylabel("Duality gap")
    axes[1, 2].grid(True)
    axes[1, 2].legend((dgs_line[0], rdgs_line[0]), ("Absolute", "Relative"),
                      title="Objectives")

    if fig_fn is not None:
        plt.savefig(fig_fn)
    else:
        plt.show()
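
# Hypothetical driver (assumption, not part of the original file): run the
# single-dataset diagnostics on a tiny synthetic 2D problem whose target is a
# linear function of the features. All *_demo names are illustrative only.
if __name__ == "__main__":
    rng_demo = np.random.RandomState(1001)
    X_demo = rng_demo.rand(60, 2)                 # 60 examples, 2 features
    target_demo = X_demo @ np.array([2.0, -1.0])  # linearly generated targets
    single_dataset(X_demo, target_demo, kernel="linear", C=1, fig_fn=None)
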

def compare_datasets2(fig_fn=None):
    fig, axes = plt.subplots(3, 2)
    if fig_fn is not None:
        fig.set_size_inches(12, 8)

    # for k, type in enumerate(["linear", "quadratic", "open_circle"]):
    for k, type in enumerate(["quadratic"]):
        print("Type: %s" % type)

        X, target, d_X, d_target = create_artificial_dataset2(type=type, n=150)
        keys = list(d_X.keys())

        visualize_dataset2(X[:, 0], X[:, 1], target, axes[k, 0],
                           title="Dataset: %s" % type)

        target_pred = np.zeros(len(X))
        mean_score = 0.0
        param_grid = {"C": [1]}

        cv = RepeatedKFold(n_splits=10, n_repeats=1, random_state=1)
        for k_cv, (train_set, test_set) in enumerate(cv.split(keys)):
            print("Fold %d / %d" % (k_cv + 1, cv.get_n_splits()))

            keys_train = [keys[idx] for idx in train_set]
            keys_test = [keys[idx] for idx in test_set]

            d_X_train, d_target_train = OrderedDict(), OrderedDict()
            for key in keys_train:
                d_X_train[key] = d_X[key]
                d_target_train[key] = d_target[key]
            # d_X_train = {key: value for key, value in d_X.items() if key in keys_train}
            # d_target_train = {key: value for key, value in d_target.items() if key in keys_train}

            if type == "linear":
                ranksvm_kernel = KernelRankSVC(verbose=False, kernel="linear",
                                               feature_type="difference",
                                               slack_type="on_pairs",
                                               step_size_algorithm="diminishing_2",
                                               convergence_criteria="alpha_change_norm")
            elif type == "quadratic":
                ranksvm_kernel = KernelRankSVC(verbose=False, kernel="poly",
                                               feature_type="difference",
                                               slack_type="on_pairs")
                param_grid["degree"] = [2]
            elif type == "open_circle":
                ranksvm_kernel = KernelRankSVC(verbose=False, kernel="rbf",
                                               feature_type="difference",
                                               slack_type="on_pairs")
                param_grid["gamma"] = [3]
            else:
                raise ValueError("Invalid test data type: %s" % type)

            cv_inner = GroupKFold(n_splits=3)
            best_params, param_scores, n_pairs_train, best_estimator, _, _ = find_hparan_ranksvm(
                ranksvm_kernel, d_X_train, d_target_train, cv=cv_inner,
                param_grid=param_grid,
                pair_params={"allow_overlap": True, "d_upper": 4, "d_lower": 0,
                             "ireverse": True},
                n_jobs=1)
            print(best_params)

            X_test = np.array([d_X[key] for key in keys_test])
            target_test = np.array([d_target[key] for key in keys_test])
            pairs_test = get_pairs_single_system(target_test, d_lower=0, d_upper=np.inf)

            target_pred[test_set] += best_estimator.map_values(X_test)

            score = best_estimator.score(X_test, pairs_test)
            print(score)
            mean_score += score

        target_pred /= cv.get_n_splits()
        mean_score /= cv.get_n_splits()
        print(mean_score)

        visualize_ranksvm([d_target[key] for key in keys], target_pred, axes[k, 1])

    if fig_fn is not None:
        plt.tight_layout()
        plt.savefig(fig_fn)
    else:
        plt.show()

def compare_datasets3(fig_fn=None):
    fig, axes = plt.subplots(3, 2)
    fig_conv, axes_conv = plt.subplots(3, 3)
    if fig_fn is not None:
        fig.set_size_inches(12, 8)
        fig_conv.set_size_inches(12, 8)

    # for k, type in enumerate(["linear", "quadratic", "open_circle"]):
    for k, type in enumerate(["linear"]):
        print("Type: %s" % type)

        X, target, d_X, d_target = create_artificial_dataset2(type=type, n=400)
        keys = list(d_X.keys())

        visualize_dataset2(X[:, 0], X[:, 1], target, axes[k, 0],
                           title="Dataset: %s" % type)

        param_grid = {"C": [1]}

        train_set, test_set = list(ShuffleSplit(n_splits=1, train_size=0.75,
                                                test_size=0.25).split(keys))[0]
        keys_train = [keys[idx] for idx in train_set]
        keys_test = [keys[idx] for idx in test_set]

        d_X_train, d_target_train = OrderedDict(), OrderedDict()
        for key in keys_train:
            d_X_train[key] = d_X[key]
            d_target_train[key] = d_target[key]

        if type == "linear":
            ranksvm_kernel = KernelRankSVC(verbose=True, debug=2, kernel="linear",
                                           feature_type="difference",
                                           slack_type="on_pairs",
                                           step_size_algorithm="diminishing",
                                           convergence_criteria="gs_change")
        elif type == "quadratic":
            ranksvm_kernel = KernelRankSVC(verbose=True, debug=2, kernel="poly",
                                           feature_type="difference",
                                           slack_type="on_examples")
            param_grid["degree"] = [2]
        elif type == "open_circle":
            ranksvm_kernel = KernelRankSVC(verbose=True, debug=2, kernel="rbf",
                                           feature_type="difference",
                                           slack_type="on_examples")
            param_grid["gamma"] = [3]
        else:
            raise ValueError("Invalid test data type: %s" % type)

        best_params, param_scores, n_pairs_train, best_estimator, _, _ = find_hparan_ranksvm(
            ranksvm_kernel, d_X_train, d_target_train, cv=None, param_grid=param_grid,
            pair_params={"allow_overlap": True, "d_upper": 8, "d_lower": 0,
                         "ireverse": True},
            n_jobs=1, scaler=None)
        print("Best params:", best_params)

        X_test = np.array([d_X[key] for key in keys_test])
        target_test = np.array([d_target[key] for key in keys_test])
        pairs_test = get_pairs_single_system(target_test, d_lower=0, d_upper=np.inf)

        target_pred = best_estimator.map_values(X_test)
        print("Score: %f" % best_estimator.score(X_test, pairs_test))

        visualize_ranksvm(target_test, target_pred, axes[k, 1])
        inspect_convergence(best_estimator, np.array(list(d_target_train.values())),
                            axes_conv[k, :])

    if fig_fn is not None:
        plt.tight_layout()
        plt.savefig(fig_fn)
    else:
        plt.show(fig)
        plt.show(fig_conv)

def compare_datasets(fig_fn=None):
    fig, axes = plt.subplots(3, 4)
    if fig_fn is not None:
        fig.set_size_inches(12, 8)

    for k, type in enumerate(["linear", "quadratic", "open_circle"]):
        print("Type: %s" % type)

        X, target = create_artificial_dataset(type=type, n=50, random_state=1001)
        # print(X[range(10)])

        # pairs = get_pairs({"X": X, "target": target})
        # X_diff, y_clf = get_pairwise_features2(X, pairs, balance_classes=True)
        #
        # # Visualize dataset and feature space
        # visualize_dataset({"X": X, "target": target},
        #                   {"X_diff": X_diff, "y_clf": y_clf}, axes[k])

        # Train a linear rankSVM
        # target_pred_linear = np.zeros(len(X))
        # target_pred_kernel = np.zeros(len(X))
        mean_score = 0.0

        # cv = KFold(n_splits=10, random_state=646, shuffle=True)
        cv = PredefinedSplit(np.zeros(len(X)))

        # for k_cv, (_, test_set) in enumerate(cv.split(X)):
        for i_rep in range(10):
            # print("Fold %d / %d" % (k_cv + 1, cv.n_splits))
            # print(train_set[range(10)], test_set[range(10)])

            # Train and evaluate on the full dataset
            train_set = test_set = range(len(X))

            pairs_train = get_pairs_single_system(target[train_set], d_lower=0,
                                                  d_upper=4)
            pairs_test = get_pairs_single_system(target[test_set], d_lower=0,
                                                 d_upper=np.inf)

            # ranksvm_linear = linear_rank_svm(verbose=False)
            # ranksvm_linear.train(X[train_set], pairs_train, n_jobs=2, C=[1])
            #
            # target_pred_linear[test_set] = ranksvm_linear.map_values(X[test_set])[0]

            ranksvm_kernel = KernelRankSVC(C=0.1, verbose=False, kernel="precomputed",
                                           feature_type="difference")

            if type == "linear":
                KX_train = linear_kernel(X[train_set], X[train_set])
                KX_train_test = linear_kernel(X[train_set], X[test_set])
            elif type == "quadratic":
                KX_train = polynomial_kernel(X[train_set], X[train_set], degree=2)
                KX_train_test = polynomial_kernel(X[train_set], X[test_set], degree=2)
            elif type == "open_circle":
                KX_train = rbf_kernel(X[train_set], X[train_set], gamma=3)
                KX_train_test = rbf_kernel(X[train_set], X[test_set], gamma=3)
            else:
                raise ValueError("Invalid test data type: %s" % type)

            ranksvm_kernel.fit(np.arange(KX_train.shape[0]), None,
                               fit_params={"KX": KX_train, "pairs": pairs_train})

            score = ranksvm_kernel.score(KX_train_test, pairs_test)
            mean_score += score
            print(score)

            # target_pred_kernel[test_set] = ranksvm_kernel.map_values(KX_train_test)

        print(mean_score / 10)

        # visualize_ranksvm(target[:, 0], target_pred_linear, axes[k, 2])
        # visualize_ranksvm(target[:, 0], target_pred_kernel, axes[k, 3])

    if fig_fn is not None:
        plt.tight_layout()
        plt.savefig(fig_fn)
    else:
        plt.show()
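
# Hypothetical entry point (assumption, not part of the original file): run the
# comparison experiments defined above and show the figures interactively.
if __name__ == "__main__":
    compare_datasets(fig_fn=None)
    compare_datasets2(fig_fn=None)
    compare_datasets3(fig_fn=None)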