def neighbors(train, test, target, cv: PredefinedSplit, k=5, n_trees=10):
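    # Out-of-fold neighbour features: for each CV fold an approximate
    # nearest-neighbour index is built separately on the negative (target == 0)
    # and the positive (target == 1) training rows; the held-out rows get a
    # feature from each index, and the test-set features are averaged over the
    # folds. `build` and `get_feat` are assumed to be ANN helper functions
    # (e.g. Annoy-style, hence n_trees) defined elsewhere in the project.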
    res_train = np.zeros((train.shape[0], 2))
    res_test = np.zeros((test.shape[0], 2))
    for i, (trn_idx, val_idx) in tqdm(enumerate(cv.split(train)),
                                      total=cv.get_n_splits()):
        target_trn = target.iloc[trn_idx]
        X_trn = train.iloc[trn_idx]
        X_val = train.iloc[val_idx]
        n = X_trn[target_trn == 0]
        p = X_trn[target_trn == 1]
        for j, X in enumerate([n, p]):
            u = build(X, n_trees)
            res_train[val_idx, j] = get_feat(X_val, u, k=k)
            res_test[:, j] += get_feat(test, u, k)
    res_test /= cv.get_n_splits()
    return res_train, res_test
Example 2
def target_encoding(X_train, y_train, X_test, cols, cv_id):
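    # Out-of-fold target encoding: for each fold of the PredefinedSplit an
    # encoder is fitted on the training rows only, the held-out rows are
    # replaced by their encoded values, and the test-set encodings are
    # averaged over the folds. TargetEncoder is assumed to be
    # category_encoders.TargetEncoder.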
    cols = list(cols)
    train_new = X_train.copy()
    test_new = X_test.copy()
    test_new[:] = 0
    cv = PredefinedSplit(cv_id)
    X_train.index = X_train.index.astype(int)
    for trn_idx, val_idx in tqdm(cv.split(X_train), total=cv.get_n_splits()):
        enc = TargetEncoder(cols=cols)
        enc.fit(X_train.iloc[trn_idx], y_train[trn_idx])
        train_new.iloc[val_idx] = enc.transform(X_train.iloc[val_idx])
        test_new += enc.transform(X_test)
    test_new /= cv.get_n_splits()
    train_new = train_new[cols]
    test_new = test_new[cols]
    train_new.columns = train_new.columns + '_target'
    test_new.columns = test_new.columns + '_target'
    print(list(train_new.columns))
    return train_new, test_new
Example 3
def test_predefined_split():
    cv = PredefinedSplit(np.array(list(range(4)) * 5))
    cv2 = PredefinedSplit(np.array(list(range(5)) * 4))
    assert tokenize(cv) == tokenize(cv)
    assert tokenize(cv) != tokenize(cv2)

    sol = cv.get_n_splits(np_X, np_y, np_groups)
    assert compute_n_splits(cv, np_X, np_y, np_groups) == sol

    with assert_dask_compute(False):
        assert compute_n_splits(cv, da_X, da_y, da_groups) == sol
Example 4
def main(argv):
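    # Loads a grid-search configuration module named on the command line,
    # optionally concatenates a separate test file onto the training data and
    # builds a PredefinedSplit over the combined set, then runs the grid
    # search and saves the best model. Helpers such as readFile, gridSearch,
    # gridResults and SaveModel are assumed to be defined in this project.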
    start_time = datetime.now()
    logger.info("START")
    args = argparser.parse_args()
    inFile = args.inFile
    testFile = args.testFile
    nameModel = args.nameModel
    conf_file = args.mod
    mod = __import__(conf_file, fromlist=['*'])
    model_conf = mod.gridSearch_Model_types[nameModel]
    conf = getattr(__import__(conf_file, fromlist=[model_conf]), model_conf)
    prefix_dict = conf['prefix_dict']
    out_dict = h.outfileName(fo=args.outFile,
                             fi=inFile,
                             prefix_dict=prefix_dict,
                             add_date=True)
    logger.info("RUNNING WITH MOD: %s, INFILE: %s" % (conf_file, inFile))
    logger.info("LOADING THE DATA SET")
    param_grid = PARAM_DICT[nameModel]
    # scoring = {'Accuracy': make_scorer(accuracy_score),'RMS':make_scorer(mean_squared_error)}
    scoring = {'RMS': make_scorer(r2_score)}
    X, Y, len_train, numFeatures = readFile(inFile)
    cv = None
    if testFile:
        logger.info("USING TEST FILE %s AS TEST SET FOR THE CORSS VALIDATION" %
                    testFile)
        X_test, Y_test, len_train_test, numFeatures_test = readFile(inFile)
        X = pd.concat([X, X_test], ignore_index=True)
        Y = pd.concat([Y, Y_test], ignore_index=True)
        cv_arr = [1] * len_train
        cv_arr.extend([0] * len_train_test)
        cv = PredefinedSplit(test_fold=cv_arr)
        print("Stampa di cv: ", cv)
        print("numero di fold", cv.get_n_splits())
        for train_index, test_index in cv.split():
            print("TRAIN:", train_index, "TEST:", test_index)
        logger.info("SHAPE OF X:%s AND Y:%s AFTER APPEND", X.shape, Y.shape)
    logger.info("CREATION OF THE MODEL")
    t = TestClass(conf=conf, nm=nameModel, nf=numFeatures)
    if nameModel == 'NN':
        model = KerasClassifier(build_fn=t.createModelNN)
        X = X.to_numpy()
        Y = Y.to_numpy()
    else:
        model = t.selectModel()
    logger.info("START GRID SEARCH")
    grid_result = gridSearch(model, param_grid, cv, X, Y, scoring)
    logger.info("END OF GRID SEARCH")
    logger.info("PRINTING RESULTS")
    gridResults(grid_result, X, nameModel)
    SaveModel(nameModel, grid_result)
    logger.info("EXECUTED IN %f SEC" %
                ((datetime.now() - start_time)).total_seconds())
    logger.info("END")
Example 6
def aggregate_fold_stats(db_paths, cv_pkl_file):
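    # For every outer fold in the pickled test_fold array, compute per-pixel
    # mean/std statistics over the held-out images, then do the same for the
    # train and validation subsets of each nested fold; the results are
    # pickled next to the CV definition file. imglmdb and preprocessing are
    # project-specific modules assumed to be imported elsewhere.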
    preprocessed_db = imglmdb.multidbwrapper(sorted(db_paths))
    with open(cv_pkl_file, "rb") as pkl:
        test_fold, nested_test_folds = pickle.load(pkl)

    splitter = PredefinedSplit(test_fold)

    data = [{} for _ in range(splitter.get_n_splits())]  # independent dicts, one per fold

    for i, (nested_test_fold,
            (_,
             test_idx)) in enumerate(zip(nested_test_folds, splitter.split())):
        per_pixel_stats = preprocessing.compute_per_pixel_stats(
            preprocessed_db, None, idx=test_idx)
        std_per_pixel = numpy.where(per_pixel_stats[1] == 0.0, 1,
                                    per_pixel_stats[1])
        data[i]["outer"] = (per_pixel_stats[0], std_per_pixel)

        nested_splitter = PredefinedSplit(nested_test_fold)
        data[i]["nested"] = [{}] * nested_splitter.get_n_splits()

        for j, (train_idx, val_idx) in enumerate(nested_splitter.split()):
            per_pixel_stats = preprocessing.compute_per_pixel_stats(
                preprocessed_db, None, idx=train_idx)
            std_per_pixel = numpy.where(per_pixel_stats[1] == 0.0, 1,
                                        per_pixel_stats[1])
            data[i]["nested"][j]["train"] = (per_pixel_stats[0], std_per_pixel)

            per_pixel_stats = preprocessing.compute_per_pixel_stats(
                preprocessed_db, None, idx=val_idx)
            std_per_pixel = numpy.where(per_pixel_stats[1] == 0.0, 1,
                                        per_pixel_stats[1])
            data[i]["nested"][j]["val"] = (per_pixel_stats[0], std_per_pixel)

    with open(os.path.splitext(cv_pkl_file)[0] + "_stats.pkl", "wb") as pkl:
        pickle.dump(data, pkl)

    return data
def test_predefinedsplit_with_kfold_split():
    # Check that PredefinedSplit can reproduce a split generated by Kfold.
    folds = -1 * np.ones(10)
    kf_train = []
    kf_test = []
    for i, (train_ind, test_ind) in enumerate(KFold(5, shuffle=True).split(X)):
        kf_train.append(train_ind)
        kf_test.append(test_ind)
        folds[test_ind] = i
    ps_train = []
    ps_test = []
    ps = PredefinedSplit(folds)
    # n_splits is simply the no of unique folds
    assert_equal(len(np.unique(folds)), ps.get_n_splits())
    for train_ind, test_ind in ps.split():
        ps_train.append(train_ind)
        ps_test.append(test_ind)
    assert_array_equal(ps_train, kf_train)
    assert_array_equal(ps_test, kf_test)
Example 9
                            break
                    X_test = np.array(new_x)
                    y_test = np.array(new_y)
                    print(X_test.shape, y_test.shape, len(y_test[y_test == 0]),
                          len(y_test[y_test == 1]))
                    assert X_test.shape[1] == tmp_shape[1]
                    assert X_test.shape[0] >= tmp_shape[0]
                    assert len(y_test[y_test == 0]) == len(y_test[y_test == 1])

                # leave person out each fold
                test_fold = np.concatenate(
                    [[0] * 43, [1] * 43, [2] * 43, [3] * 43, [4] * 43,
                     [5] * 43, [6] * 43, [7] * 43,
                     [-1] * ((nsubjects * (nclips - 1)) - (8 * nsubjects))])
                gkf = PredefinedSplit(test_fold)
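                # With this test_fold the first 8 * 43 samples form eight
                # leave-one-person-out folds (ids 0-7), while the samples
                # marked -1 stay in the training set of every split.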
                print('split train set into:', gkf.get_n_splits(), 'folds')

                # We will use a Support Vector Classifier with class_weight balanced
                svm = SVC(class_weight='balanced')
                clf_best = GridSearchCV(
                    estimator=svm,
                    param_grid=p_grid,
                    cv=gkf,
                    iid=False,
                    scoring=['accuracy', 'balanced_accuracy', 'f1_macro'],
                    refit='f1_macro'
                )  # get params that give best 'refit' value
                clf_best.fit(X_train, y_train)
                y_pred = clf_best.predict(X_train)
                train_f1 = clf_best.best_score_
Example 10
raw_test = read_idx("./Bases/MNIST/t10k-images-idx3-ubyte.gz")
test_data = raw_test.reshape(10000, 28 * 28)
test_label = read_idx("./Bases/MNIST/t10k-labels-idx1-ubyte.gz")
'''
amostra=base_MNIST.head()
base_MNIST = pd.DataFrame(data=amostra)
base_MNIST.to_excel("./Bases/MNIST.xlsx")
'''

X_total = np.concatenate(
    (train_data,
     test_data)) / 255.0  # scaling so the variables lie between 0 and 1
Y_total = np.concatenate((train_label, test_label))
base_sep = np.repeat([-1, 0], [60000, 10000])
ps = PredefinedSplit(base_sep)
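# The 60 000 training images are marked -1 (never held out), so ps yields a
# single split whose validation fold is exactly the official 10 000-image test set.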
'''
ps.get_n_splits()
for train_index, test_index in ps.split():
    print("TRAIN:", train_index, "TEST:", test_index)
'''

# visualize the data
foto = 123
fig = plt.figure(figsize=(2, 2))
ax = fig.add_subplot(111)
ax.set_axis_off()
ax.imshow(raw_train[foto, :], cmap=plt.cm.gray_r, interpolation='nearest')
ax.set_title("valor do target = " + np.str(train_label[foto]))

# reduce/filter the training set
idx = (train_label == 2) | (train_label == 3) | (train_label == 8)
Example 11
def main():
    parser = ArgumentParser()
    add_args(parser)
    args = parser.parse_args()
    dsmoothedTraj = pickle.load(open(args.inputfile, 'rb'))
    addsmooth(dsmoothedTraj)
    ltraj = [
        smo for smo in dsmoothedTraj.values() if smo.trajff.shape[0] > 100
    ]
    print([smo.trajff.shape[0] for smo in ltraj])
    ntraj = len(ltraj)
    random.shuffle(ltraj)
    ts = pd.concat([smo.trajff for smo in ltraj])
    print(list(ts))
    # hyperparameters used for the 25.02 submissions
    # parameters ={'feature_fraction': 0.837266468665352, 'learning_rate': 0.0013782873851139932, 'min_child_samples': 33, 'num_leaves': 4, 'reg_lambda': 5.725801055525217e-12, 'subsample': 0.4944846046759285}
    # parameters={k:[v] for k,v in parameters.items()}
    parameters = {
        'num_leaves': scipy.stats.randint(2, 11),
        'learning_rate': scipy.stats.loguniform(1e-4, 1e-2),
        'min_child_samples': scipy.stats.randint(10, 60),
        'subsample': scipy.stats.uniform(loc=0.3, scale=0.4),
        'reg_lambda': scipy.stats.loguniform(1e-14, 1e-10),
        'feature_fraction': scipy.stats.uniform(loc=0.7, scale=0.3),
    }
    print(parameters)
    lvar = [
        "error", "smoothedrawerror", "nb", "dt01", "countmeasure",
        "countmeasurecorrected", "baroAltitude"
    ] + [x for x in list(ts) if "density" in x] + [
        x for x in list(ts) if "speed" in x
    ] + [x for x in list(ts) if "curvature" in x]
    if args.latlon:
        lvar = lvar + ["smoothedlatitude", "smoothedlongitude"
                       ]  #"nnpredlatitude","nnpredlongitude"]
    if args.dbaro:
        lvar = lvar + ["dbaroAltitude"]
    # compute folds so that each aircraft is inside only one fold
    test_fold = np.concatenate([
        np.repeat(i // 30, smo.trajff.shape[0]) for i, smo in enumerate(ltraj)
    ])  #[keep]
    ps = PredefinedSplit(test_fold)
    print("number of folds", ps.get_n_splits())
    lsensors, X = makeX(ts, lvar)
    X = X
    y = makey(ts)
    model = MyLGBMClassifier(
        lsensors,
        feature_fraction=1,
        num_leaves=7,
        learning_rate=0.1,
        min_child_samples=10,
        subsample=1.,
        reg_lambda=0.) if args.classif else lgb.LGBMRegressor(
            n_estimators=4000,
            subsample_freq=10,
            random_state=0,
            n_jobs=1,
            objective='l2',
            importance_type='gain',
            max_bin=511)
    model = RandomizedSearchCV(model,
                               parameters,
                               cv=ps,
                               n_jobs=args.n_jobs,
                               verbose=1,
                               n_iter=args.n_iter,
                               random_state=0)
    # 3 dirty lines below... just close your eyes and skip it
    model.argslearnmodel = args
    model.lsensors = lsensors
    model.lvar = lvar
    model.fit(X, y)
    print(model.score(X, y))
    print(model.cv_results_)
    print(model.best_params_)
    print(model.best_score_)
    if args.outputfile != '':
        with open(args.outputfile, 'wb') as f:
            pickle.dump(model, f)
Example 12
def evaluate_on_target_systems(target_systems,
                               training_systems,
                               predictor,
                               pair_params,
                               kernel_params,
                               opt_params,
                               input_dir,
                               estimator,
                               feature_type,
                               n_jobs=1,
                               perc_for_training=100):
    """
    Task: Evaluate rank-correlation, accuracy, etc. by learning an order predictor using the given
          set of training systems and prediction on the given set of target systems.

          For the evaluation we use either a repeated random-split of the target systems' data
          (if less than 75 examples are provided for test) or a cross-validation (else). The
          hyper-parameters of the order predictor are optimized using a nested cross-validation. The
          The routines for that can be found in the file 'model_selection_cls.py'.

          If desired (excl_mol_by_struct_only == True), the molecular structures from the test set
          are removed from the training set based on their molecular structure, e.g. by comparison
          of their InChIs, _even_ if these structures have been measured with a system other than
          the (current) target system, i.e., another chromatographic system.

          See also the paper for details on the evaluation strategy.

    :param target_systems: list of strings, containing the target systems

    :param training_systems: list of strings, containing the training systems

    :param predictor: list of string, containing the predictors / molecular features used for the
        model construction.

    :param pair_params: dictionary, containing the parameters used for the creation of
        the RankSVM learning pairs, e.g. minimum and maximum order distance.

    :param kernel_params: dictionary, containing the parameters for the kernels and
        generally for handling the input features / predictors. See definition of the
        dictionary in the __main__ of file 'evaluation_scenario_cls.py'.

    :param opt_params: dictionary, containing the parameters controlling the hyper-parameter
        optimization, number of cross-validation splits, etc. See definition of the
        dictionary in the __main__ of file 'evaluation_scenario_cls.py'.

    :param input_dir: string, directory containing the input data, e.g., fingerprints and retention
        times.

    :param estimator: string, order predictor to use: either "ranksvm" or "svr".

    :param feature_type: string, feature type that is used for the RankSVM. Currently
        only 'difference' features are supported, i.e., \phi_j - \phi_i is used for
        the decision. If the estimator is not RankSVM, but e.g. Support Vector Regression,
        then this parameter can be set to None and is ignored.

    :param n_jobs: integer, number of jobs used for the hyper-parameter estimation. The maximum
        number of jobs used is the number of inner splits (cross-validation or random split)!

    :param perc_for_training: scalar, percentage of the target system's data that is
        used for the training, e.g., selected by simple random sub-sampling. This value
        only affects the training process, if the target system is in the set of training
        systems.

    :return: tuple of pandas.DataFrame

        1) mapped_values: predicted order scores for each target system
            - corresponds to: w^\phi_i in the RankSVM case
            - corresponds to: the predicted retention time, in the SVR case
        2) correlations: rank correlations of the order scores for each target system
        3) accuracies: pairwise prediction accuracies for each target system
        4) simple_statistics: number of training and test examples, etc.
        5) grid_search_results: hyper-parameter scores for the different grid-parameters
        6) grid_search_best_params: hyper-parameter scores for the best grid-parameters

        NOTE: The returned results (except mapped_values and grid search results) are averages
              across the different random splits / cross-validation folds and repetitions.
    """

    # Variables related to the number of random / cv splits, for inner (*_cv)
    # and outer fold (*_ncv).
    n_splits_shuffle = opt_params["n_splits_shuffle"]
    n_splits_nshuffle = opt_params["n_splits_nshuffle"]
    n_splits_cv = opt_params["n_splits_cv"]
    n_splits_ncv = opt_params["n_splits_ncv"]
    n_rep = opt_params["n_rep"]

    # Should molecules be excluded from the training, if their structure appears
    # in the test _even if_ they have been measured with another system than the
    # (current) target system:
    excl_mol_by_struct_only = opt_params["excl_mol_by_struct_only"]

    # Currently only 'slack_type == "on_pairs"' is supported.
    slack_type = opt_params["slack_type"]
    if slack_type != "on_pairs":
        raise ValueError("Invalid slack type: %s" % slack_type)

    # Should all possible pairs be used for the (inner) test split during the
    # parameter estimation, regardless of what are the settings for 'd_upper'
    # and 'd_lower'?
    all_pairs_for_test = opt_params["all_pairs_for_test"]

    if not estimator in ["ranksvm", "svr"]:
        raise ValueError("Invalid estimator: %s" % estimator)

    # RankSVM and SVR regularization parameter
    param_grid = {"C": opt_params["C"]}

    if estimator == "svr":
        # error-tube width of the SVR
        param_grid["epsilon"] = opt_params["epsilon"]

    # Molecule kernel
    if kernel_params["kernel"] == "linear":
        kernel = "linear"
    elif kernel_params["kernel"] in ["rbf", "gaussian"]:
        param_grid["gamma"] = kernel_params["gamma"]
        kernel = "rbf"
    elif kernel_params["kernel"] == "tanimoto":
        if estimator in ["ranksvm"]:
            kernel = tanimoto_kernel
        elif estimator in ["svr"]:
            kernel = tanimoto_kernel_mat
    elif kernel_params["kernel"] == "minmax":
        if estimator in ["ranksvm"]:
            kernel = minmax_kernel
        elif estimator in ["svr"]:
            kernel = minmax_kernel_mat
    else:
        raise ValueError("Invalid kernel: %s." % kernel_params["kernel"])

    if isinstance(target_systems, str):
        target_systems = [target_systems]
    if isinstance(training_systems, str):
        training_systems = [training_systems]
    all_systems = list(set(target_systems).union(training_systems))

    assert isinstance(target_systems, list) and isinstance(
        training_systems, list)

    n_target_systems = len(target_systems)
    n_training_systems = len(training_systems)

    print("Target systems (# = %d): %s" %
          (n_target_systems, ",".join(target_systems)))
    print("Training systems (# = %d): %s" %
          (n_training_systems, ",".join(training_systems)))

    ## Load the target and training systems into dictionaries using (molecule, system)-keys
    ## with retention times and molecular features, respectively, as values

    # If we use molecular descriptors, we need to scale the data, e.g. to [0, 1].
    if kernel_params["scaler"] == "noscaling":
        scaler = None
    elif kernel_params["scaler"] == "minmax":
        scaler = MinMaxScaler()
    elif kernel_params["scaler"] == "std":
        scaler = StandardScaler()
    elif kernel_params["scaler"] == "l2norm":
        scaler = Normalizer()
    else:
        raise ValueError("Invalid scaler for the molecular features: %s" %
                         kernel_params["scaler"])

    # Handle counting MACCS fingerprints
    if predictor[0] == "maccsCount_f2dcf0b3":
        predictor_c = ["maccs"]
        predictor_fn = "fps_maccs_count.csv"
    else:
        predictor_c = predictor
        predictor_fn = None

    d_rts, d_features, d_system_index = OrderedDict(), OrderedDict(
    ), OrderedDict()
    for k_sys, system in enumerate(all_systems):
        rts, data = load_data(input_dir,
                              system=system,
                              predictor=predictor_c,
                              pred_fn=predictor_fn)

        # Use (mol-id, system)-tuple as key
        keys = list(zip(rts.inchi.values, [system] * rts.shape[0]))

        # Values: retention time, features
        rts = rts.rt.values.reshape(-1, 1)
        data = data.drop("inchi", axis=1).values

        if kernel_params["poly_feature_exp"]:
            # If we use binary fingerprints, we can include some
            # interactions, e.g. x_1x_2, ...
            data = PolynomialFeatures(interaction_only=True,
                                      include_bias=False).fit_transform(data)

        # Make ordered dictionaries
        d_rts[system], d_features[system] = OrderedDict(), OrderedDict()

        for i, key in enumerate(keys):
            d_rts[system][key] = rts[i, 0]
            d_features[system][key] = data[i, :]

        # Dictionary containing a unique numeric identifier for each system
        d_system_index[system] = k_sys

        if scaler is not None:
            if getattr(scaler, "partial_fit", None) is not None:
                # 'partial_fit' allows us to learn the parameters of the scaler
                # online. (great stuff :))
                scaler.partial_fit(data)
            else:
                # We have a scaler at hand that does not allow online fitting.
                # This probably means it is a scaler that performs the desired
                # scaling for each example independently, e.g.
                # sklearn.preprocessing.Normalizer.
                pass

    for system in target_systems:
        print("Target set '%s' contains %d examples." %
              (system, len(d_rts[system])))

    # Collect all the data that is available for training.
    d_rts_training = join_dicts(d_rts, training_systems)
    d_features_training = join_dicts(d_features, training_systems)

    # (mol-id, system)-tuples used in the training set
    l_keys_training = list(d_features_training.keys())

    # Data frames storing the evaluation measures
    mapped_values = {
        target_system: DataFrame()
        for target_system in target_systems
    }
    accuracies, correlations, simple_statistics = DataFrame(), DataFrame(
    ), DataFrame()
    grid_search_results, grid_search_best_params = DataFrame(), DataFrame()

    for idx_system, target_system in enumerate(target_systems):
        print("Process target system: %s (%d/%d)." %
              (target_system, idx_system + 1, len(target_systems)))

        # (mol-id, system)-tuples in the target set
        l_keys_target = list(d_features[target_system].keys())

        for i_rep in range(n_rep):
            print("Repetition: %d/%d" % (i_rep + 1, n_rep))

            # Get a random subset of the training data
            l_keys_training_sub = sample_perc_from_list(l_keys_training,
                                                        tsystem=target_system,
                                                        perc=perc_for_training,
                                                        random_state=747 *
                                                        i_rep)
            print("Training set contains %d (%f%%) examples." %
                  (len(l_keys_training_sub),
                   100 * len(l_keys_training_sub) / len(l_keys_training)))
            for training_system in training_systems:
                n_train_sys_sub = sum(
                    np.array(list(zip(
                        *l_keys_training_sub))[1]) == training_system)
                n_train_sys = sum(
                    np.array(list(zip(
                        *l_keys_training))[1]) == training_system)
                print("\tSystem %s contributes %d (%f%%) examples." %
                      (training_system, n_train_sys_sub,
                       100 * n_train_sys_sub / n_train_sys))

            # Check whether the target system has any overlap with the training systems
            print("Outer validation split strategy: ", end="", flush=True)

            l_molids_training = list(zip(*l_keys_training_sub))[0]
            l_molids_target = list(zip(*l_keys_target))[0]

            if (excl_mol_by_struct_only and len(set(l_molids_training) & set(l_molids_target)) == 0) or \
                    (not excl_mol_by_struct_only and len(set(l_keys_training_sub) & set(l_keys_target)) == 0):

                print(
                    "Predefined split:\n"
                    "\tTraining and target do not share molecular structures "
                    "(excl_mol_by_struct_only=%d)" % excl_mol_by_struct_only)
                cv_outer = PredefinedSplit(np.zeros(len(l_keys_target)))

            else:
                # Determine strategy for training / test splits
                if len(l_keys_target) < 75:
                    print("ShuffleSplit")
                    train_size = 0.75
                    cv_outer = ShuffleSplit(n_splits=n_splits_shuffle,
                                            train_size=train_size,
                                            test_size=(1 - train_size),
                                            random_state=320 * i_rep)
                else:
                    print("KFold")
                    cv_outer = KFold(n_splits=n_splits_cv,
                                     shuffle=True,
                                     random_state=320 * i_rep)

            # Performance evaluation using cross-validation / random splits
            for i_fold, (_,
                         test_set) in enumerate(cv_outer.split(l_keys_target)):
                print("Outer fold: %d/%d" %
                      (i_fold + 1, cv_outer.get_n_splits()))

                # (mol-id, system)-tuples in the test subset of the target set
                l_keys_target_test = [l_keys_target[idx] for idx in test_set]

                # Remove test subset of the target set from the training set.
                # NOTE: The training set might contain the whole target set.
                l_molids_target_test = list(zip(*l_keys_target_test))[0]
                if excl_mol_by_struct_only:
                    l_keys_training_train = [
                        key for key in l_keys_training_sub
                        if key[0] not in l_molids_target_test
                    ]
                else:
                    l_keys_training_train = [
                        key for key in l_keys_training_sub
                        if key not in l_keys_target_test
                    ]

                if isinstance(cv_outer, PredefinedSplit):
                    print("Shuffle pre-defined split.")

                    rs_old = np.random.get_state()
                    np.random.seed(320 * i_fold)

                    # If we use the pre-defined splits we need to shuffle ourselves.
                    # In that way we prevent bias during the h-param estimation.
                    np.random.shuffle(
                        l_keys_training_train)  # Shuffle is done inplace

                    np.random.set_state(rs_old)

                l_molids_training_train = list(zip(*l_keys_training_train))[0]

                if excl_mol_by_struct_only:
                    assert (len(
                        set(l_molids_target_test)
                        & set(l_molids_training_train)) == 0)
                else:
                    assert (len(
                        set(l_keys_target_test)
                        & set(l_keys_training_train)) == 0)

                # Determine strategy for training / test splits (inner)
                print("Inner (h-param) validation split strategy: ",
                      end="",
                      flush=True)
                if len(l_keys_training_train) < 75:
                    print("GroupShuffleSplit")
                    train_size = 0.75
                    cv_inner = GroupShuffleSplit(n_splits=n_splits_nshuffle,
                                                 train_size=train_size,
                                                 test_size=(1 - train_size),
                                                 random_state=350 * i_fold *
                                                 i_rep)
                else:
                    print("GroupKFold")
                    cv_inner = GroupKFold(n_splits=n_splits_ncv)

                # Train the rankSVM: Find optimal set of hyper-parameters
                od_rts_training_train, od_features_training_train = OrderedDict(
                ), OrderedDict()
                for key in l_keys_training_train:
                    od_rts_training_train[key] = d_rts_training[key]
                    od_features_training_train[key] = d_features_training[key]

                start_time = time.time()

                if estimator == "ranksvm":
                    best_params, cv_results, n_train_pairs, ranking_model, _, _ = find_hparan_ranksvm(
                        estimator=KernelRankSVC(kernel=kernel,
                                                slack_type=slack_type,
                                                random_state=319 * i_fold *
                                                i_rep),
                        fold_score_aggregation="weighted_average",
                        X=od_features_training_train,
                        y=od_rts_training_train,
                        param_grid=param_grid,
                        cv=cv_inner,
                        pair_params=pair_params,
                        n_jobs=n_jobs,
                        scaler=scaler,
                        all_pairs_as_test=all_pairs_for_test)
                elif estimator == "svr":
                    best_params, cv_results, n_train_pairs, ranking_model = find_hparam_regression(
                        estimator=SVRPairwise(kernel=kernel),
                        X=od_features_training_train,
                        y=od_rts_training_train,
                        param_grid=param_grid,
                        cv=cv_inner,
                        n_jobs=n_jobs,
                        scaler=scaler)
                else:
                    raise ValueError("Invalid estimator: %s" % estimator)

                rtime_gcv = time.time() - start_time
                print("[find_hparam_*] %.3fsec" % rtime_gcv)

                # Store the grid-search statistics for further analyses
                grid_search_results_tmp = DataFrame(cv_results)
                grid_search_results_tmp["target_system"] = target_system
                grid_search_results_tmp["training_systems"] = ";".join(
                    training_systems)
                grid_search_results = grid_search_results.append(
                    grid_search_results_tmp)

                grid_search_best_params_tmp = DataFrame([best_params])
                grid_search_best_params_tmp["target_system"] = target_system
                grid_search_best_params_tmp["training_systems"] = ";".join(
                    training_systems)
                grid_search_best_params = grid_search_best_params.append(
                    grid_search_best_params_tmp)

                print(grid_search_best_params_tmp)

                ## Do prediction for the test set
                # Calculate: w' * \phi(x_i), for all molecules i
                X_test, rts_test = [], []

                for key in l_keys_target_test:
                    rts_test.append(d_rts[target_system][key])
                    X_test.append(d_features[target_system][key])

                rts_test = np.array(rts_test).reshape(-1, 1)
                X_test = np.array(X_test)

                if scaler is not None:
                    X_test = scaler.transform(X_test)

                if estimator == "ranksvm":
                    Y_pred_test = ranking_model.predict(X_test, X_test)
                elif estimator == "svr":
                    Y_pred_test = ranking_model.predict(X_test)
                else:
                    raise ValueError("Invalid estimator: %s" % estimator)

                wTx = ranking_model.map_values(X_test)

                mapped_values[target_system] = pd.concat([
                    mapped_values[target_system],
                    DataFrame({
                        "mapped_value": wTx,
                        "true_rt": rts_test.flatten(),
                        "inchi": l_molids_target_test
                    })
                ],
                                                         ignore_index=True)

                correlations = correlations.append(
                    {
                        "rank_corr": sp.stats.kendalltau(wTx, rts_test)[0],
                        "spear_corr": sp.stats.spearmanr(wTx, rts_test)[0],
                        "target_system": target_system,
                        "training_system": ";".join(training_systems)
                    },
                    ignore_index=True)

                n_train_mol = len(set(l_molids_training_train))
                n_test_mol = len(set(l_molids_target_test))
                n_shared_mol = len(
                    set(l_molids_target_test) & (set(l_molids_training_train)))
                p_shared_mol = float(n_shared_mol) / n_test_mol

                # Predict: x_i > x_j or x_i < x_j for all molecule pairs (i, j)
                with Timer("Get prediction score"):
                    for d_lower, d_upper in itertools.product(
                        [0] + list(range(1, 15, 2)),
                            2**np.array([0, 1, 2, 3, 4, 5, 6, np.inf])):
                        if d_lower > d_upper:
                            continue

                        pairs_test = get_pairs_single_system(rts_test,
                                                             d_lower=d_lower,
                                                             d_upper=d_upper)

                        accuracies = accuracies.append(
                            {
                                "score_w":
                                ranking_model.score_using_prediction(
                                    Y_pred_test, pairs_test, normalize=False),
                                "score":
                                ranking_model.score_using_prediction(
                                    Y_pred_test, pairs_test),
                                "n_pairs_test":
                                len(pairs_test),
                                "target_system":
                                target_system,
                                "training_system":
                                ";".join(training_systems),
                                "d_lower":
                                d_lower,
                                "d_upper":
                                d_upper,
                                "i_rep":
                                i_rep
                            },
                            ignore_index=True)

                        # Write out how many molecular structures are shared between the target and training systems
                        n_test_pairs = len(pairs_test)
                        simple_statistics = simple_statistics.append(
                            {
                                "n_shared_mol": n_shared_mol,
                                "p_shared_mol": p_shared_mol,
                                "n_train_mol": n_train_mol,
                                "n_test_mol": n_test_mol,
                                "n_train_pairs": n_train_pairs,
                                "n_test_pairs": n_test_pairs,
                                "grid_search_time": rtime_gcv,
                                "target_system": target_system,
                                "training_systems": ";".join(training_systems),
                                "d_lower": d_lower,
                                "d_upper": d_upper
                            },
                            ignore_index=True)

    # Average the mapped values over the repetitions
    for target_system in target_systems:
        mapped_values[target_system]["mapped_value_std"] = mapped_values[
            target_system]["mapped_value"]
        mapped_values[target_system] = mapped_values[target_system].groupby(
            ["inchi"], as_index=False).agg({
                "mapped_value": np.mean,
                "mapped_value_std": np.std,
                "true_rt": np.unique
            })

    # Aggregate the rows in 'correlations' to get the mean- and std-values across the folds.
    correlations["rank_corr_std"] = correlations["rank_corr"]
    correlations["spear_corr_std"] = correlations["spear_corr"]
    correlations = correlations.groupby(["target_system", "training_system"],
                                        as_index=False).agg({
                                            "rank_corr":
                                            np.mean,
                                            "rank_corr_std":
                                            np.std,
                                            "spear_corr":
                                            np.mean,
                                            "spear_corr_std":
                                            np.std
                                        })

    # Aggregate the rows in 'accuracies' to get the expected pairwise accuracy
    accuracies = accuracies.groupby(
        ["target_system", "training_system", "d_lower", "d_upper", "i_rep"],
        as_index=False).agg({
            "score_w": np.sum,
            "n_pairs_test": np.sum,
            "score": np.mean
        })
    accuracies["score_w"] = accuracies["score_w"] / accuracies["n_pairs_test"]
    accuracies.drop(["i_rep", "n_pairs_test"], axis=1, inplace=True)

    # Calculate expected accuracy across the repetitions
    accuracies["score_w_std"] = accuracies["score_w"]
    accuracies["score_std"] = accuracies["score"]
    accuracies = accuracies.groupby(
        ["target_system", "training_system", "d_lower", "d_upper"],
        as_index=False).agg({
            "score_w": np.mean,
            "score_w_std": np.std,
            "score": np.mean,
            "score_std": np.std
        })

    # Aggregate the simple statistics
    simple_statistics["n_shared_mol_std"] = simple_statistics["n_shared_mol"]
    simple_statistics["p_shared_mol_std"] = simple_statistics["p_shared_mol"]
    simple_statistics["n_train_mol_std"] = simple_statistics["n_train_mol"]
    simple_statistics["n_test_mol_std"] = simple_statistics["n_test_mol"]
    simple_statistics["n_train_pairs_std"] = simple_statistics["n_train_pairs"]
    simple_statistics["n_test_pairs_std"] = simple_statistics["n_test_pairs"]
    simple_statistics["grid_search_time_std"] = simple_statistics[
        "n_test_pairs"]

    simple_statistics = simple_statistics.groupby(
        ["target_system", "training_systems", "d_lower", "d_upper"],
        as_index=False).agg({
            "n_shared_mol": np.mean,
            "p_shared_mol": np.mean,
            "n_train_mol": np.mean,
            "n_test_mol": np.mean,
            "n_train_pairs": np.mean,
            "n_test_pairs": np.mean,
            "grid_search_time": np.mean,
            "n_shared_mol_std": np.std,
            "p_shared_mol_std": np.std,
            "n_train_mol_std": np.std,
            "n_test_mol_std": np.std,
            "n_train_pairs_std": np.std,
            "n_test_pairs_std": np.std,
            "grid_search_time_std": np.std
        })

    return mapped_values, correlations, accuracies, simple_statistics, grid_search_results, grid_search_best_params
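
# A compact sketch of the outer-split selection rule described in the
# docstring of 'evaluate_on_target_systems': a single predefined fold when the
# target and training data share no molecules (respectively no
# (molecule, system) keys, depending on excl_mol_by_struct_only), a repeated
# ShuffleSplit for small target sets (fewer than 75 examples), and a shuffled
# KFold otherwise. The helper name and default split counts below are
# illustrative only; the original reads its split counts from opt_params.
import numpy as np
from sklearn.model_selection import KFold, PredefinedSplit, ShuffleSplit


def choose_outer_cv(l_keys_target, shares_test_structures,
                    n_splits_shuffle=10, n_splits_cv=10, random_state=None):
    if not shares_test_structures:
        # Every target example goes into the one and only test fold.
        return PredefinedSplit(np.zeros(len(l_keys_target)))
    if len(l_keys_target) < 75:
        return ShuffleSplit(n_splits=n_splits_shuffle, train_size=0.75,
                            test_size=0.25, random_state=random_state)
    return KFold(n_splits=n_splits_cv, shuffle=True, random_state=random_state)
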
# %% --------------------
import numpy as np
from sklearn.model_selection import PredefinedSplit

# %% --------------------
X = np.array([0, 1, 2, 3, 4])
y = np.array([0, 0, 1, 1, 1])

# %% --------------------
# For example, when using a validation set, set the test_fold to 0 for all samples that are part
# of the validation set, and to -1 for all other samples.
# test_fold = [2, 2, 1, 1, -1]
test_fold = np.append(np.full(4, -1), np.full(1, 0))

# %% --------------------
ps = PredefinedSplit(test_fold)

# %% --------------------
print(ps.get_n_splits())

# %% --------------------
print(ps)

# %% --------------------
for train_index, test_index in ps.split():
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

# %% --------------------
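# A minimal sketch of how such a single-fold PredefinedSplit is typically
# handed to GridSearchCV: every parameter combination is then scored once, on
# the one predefined validation fold, instead of on rotating CV folds. The
# estimator and the parameter grid below are placeholders, not taken from the
# snippet above.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

search = GridSearchCV(LogisticRegression(max_iter=1000),
                      param_grid={'C': [0.1, 1.0, 10.0]},
                      cv=ps)
search.fit(X.reshape(-1, 1), y)  # X above is 1-D, so reshape it into a column
print(search.best_params_)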
Example 14
print('Real:', Counter(y_train))
print('Over:', Counter(y_over))
print('Under:', Counter(y_under))
print('Balanced:', Counter(y_bal))

###############################################################################
## Create learning model (Decision Tree) and tune hyperparameters
###############################################################################
from sklearn.model_selection import PredefinedSplit
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
### -1 indices -> train
### 0  indices -> validation
test_fold = np.repeat([-1, 0], [X_train.shape[0], X_val.shape[0]])
myPreSplit = PredefinedSplit(test_fold)
myPreSplit.get_n_splits()
myPreSplit.split()
for train_index, test_index in myPreSplit.split():
    print("TRAIN:", train_index, "TEST:", test_index)

parameters = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 10, 100, 1000, 10000, 100000, 1000000, None],
    'min_samples_split': [2, 3, 4]
}
clf = DecisionTreeClassifier()
model = GridSearchCV(estimator=clf,
                     param_grid=parameters,
                     scoring='f1_weighted',
                     cv=myPreSplit)
pipeline = Pipeline([('dim_red', FunctionTransformer(validate=True)),
                     ('norm', FunctionTransformer(validate=True))])

X2 = pipeline.fit_transform(X1)

#%%
import numpy as np
from sklearn.model_selection import PredefinedSplit

X = np.array([[1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 1, 1])
test_fold = [-1, -1, -1, 0]
ps = PredefinedSplit(test_fold)
ps.get_n_splits()

print(ps)

for train_index, test_index in ps.split():
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

#%%
data = pd.DataFrame(
    data={
        'text_feat': [
            'This is my first sentence. I hope you like it',
            'Here is my second sentence. It is pretty similar.'
        ],
Example 16
    return float(cm[0][0] + cm[1][1] + cm[2][2]) / (sum(cm[0]) + sum(cm[1]) +
                                                    sum(cm[2]))


def sen(m):
    return m[0][0] / (sum(m[0]))


def ppv(m):
    return m[0][0] / (m[0][0] + m[1][0])


#my_scorer= make_scorer(myMCC)
ps = PredefinedSplit(fold_set)
#print(ps)
print("NUMBER OF K-FOLDS=", ps.get_n_splits())  #returns the number  of k-folds
print("\nThe lenght of the set X:", len(X), "y: ", len(y))
print("\nThe lenght of the fold set number:", len(fold_set))
print()
print(len(X))
#print(len(X[-1]))
#print("y_true")
print(len(y))
print(type(y[1]))

mySVC = SVC(C=2.0, kernel='rbf', gamma=0.5)  #build the model SVC
print()
print("TRAINING INITIALIZATION")
print()
y_pred = cross_val_predict(mySVC, X, y, cv=ps, n_jobs=2)
print()