import os
from logging import Logger

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import (accuracy_score, auc, confusion_matrix,
                             mean_squared_error, median_absolute_error,
                             precision_recall_curve, roc_curve)

from chemprop.args import TrainArgs
from chemprop.data.utils import get_task_names
from chemprop.utils import makedirs

# Project-local helpers; these import paths are assumptions and may need to be
# adjusted to this repository's actual layout.
from .run_training import run_training
from .features import get_xgboost_feature, get_morgan_feature
from .baselines import (svm_knn_rf_class, svm_knn_rf_class_more,
                        svm_knn_rf_regre, svm_knn_rf_regre_more,
                        xgboost_cv, xgb_cv_more, xgb_regre_cv, xgb_regre_more)


def cross_validate(args: TrainArgs, logger: Logger = None):
    """k-fold cross validation of a D-MPNN, with XGBoost models trained on its
    learned features, Morgan fingerprints, and their concatenation."""
    info = logger.info if logger is not None else print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir
    task_names = args.target_columns or get_task_names(args.data_path)
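
    # The four XGBClassifier and four XGBRegressor instances below were all
    # constructed with identical hyper-parameters, so they are hoisted into two
    # small factories. The settings are copied verbatim from the original
    # calls and target an older xgboost API (silent=, n_gpus= and missing=None
    # were removed in later releases); learning_rate also corrects the
    # regressor's original `learn_rate` keyword, which xgboost silently ignores.
    def _make_xgb_classifier():
        return xgb.XGBClassifier(base_score=0.5, booster='gbtree',
                                 colsample_bylevel=1, colsample_bytree=1,
                                 gamma=1, learning_rate=0.1, max_delta_step=0,
                                 max_depth=4, min_child_weight=8, missing=None,
                                 n_estimators=2000, n_jobs=1, nthread=None,
                                 objective='binary:logistic', random_state=0,
                                 reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
                                 seed=None, silent=True, subsample=0.8,
                                 tree_method='gpu_hist', n_gpus=-1)

    def _make_xgb_regressor():
        return xgb.XGBRegressor(learning_rate=0.1, max_depth=4,
                                min_child_weight=10, gamma=1, subsample=0.8,
                                colsample_bytree=0.8, reg_alpha=0.8,
                                objective='reg:linear', n_estimators=2000,
                                tree_method='gpu_hist', n_gpus=-1)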

    # Run training on different random seeds for each fold
    dmpnn_scores = []
    dmpnn_xgb_scores = []
    morgan_scores = []
    dmpnn_morgan_scores = []
    for fold_num in range(args.num_folds):
        if args.dataset_type in ('classification', 'regression'):
            suffix = '_c' if args.dataset_type == 'classification' else '_r'
            base = f'molnet_benchmark/molnet_random_{args.protein}{suffix}/seed{fold_num + 1}'
            args.data_path = f'{base}/train.csv'
            # Note: val.csv is routed to separate_test_path and test.csv to
            # separate_val_path; this crossed assignment is preserved from the
            # original code.
            args.separate_test_path = f'{base}/val.csv'
            args.separate_val_path = f'{base}/test.csv'
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores, model, scaler = run_training(args, logger)
        dmpnn_scores.append(model_scores)
        train_target, train_feature, val_target, val_feature, test_target, test_feature, train_smiles, val_smiles, test_smiles = get_xgboost_feature(
            args, logger, model)
        train_target = pd.DataFrame(train_target)
        train_feature = pd.DataFrame(train_feature)
        val_target = pd.DataFrame(val_target)
        val_feature = pd.DataFrame(val_feature)
        test_target = pd.DataFrame(test_target)
        test_feature = pd.DataFrame(test_feature)
        train_morgan_feature = get_morgan_feature(train_smiles)
        val_morgan_feature = get_morgan_feature(val_smiles)
        test_morgan_feature = get_morgan_feature(test_smiles)
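        # get_morgan_feature is a project-local featurizer. A minimal sketch of
        # such a helper, assuming RDKit with radius-2 Morgan bits folded to
        # 2048-bit vectors and returned as a DataFrame:
        #
        #     from rdkit import Chem
        #     from rdkit.Chem import AllChem
        #
        #     def get_morgan_feature(smiles, radius=2, n_bits=2048):
        #         fps = []
        #         for smi in smiles:
        #             mol = Chem.MolFromSmiles(smi)
        #             fps.append(list(AllChem.GetMorganFingerprintAsBitVect(
        #                 mol, radius, nBits=n_bits)))
        #         return pd.DataFrame(fps)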
        if args.dataset_type == 'classification':
            if test_target.shape[1] == 1:
                xgb_gbc = _make_xgb_classifier()
                xgb_gbc.fit(train_feature,
                            train_target,
                            eval_set=[(val_feature, val_target)],
                            eval_metric='auc',
                            early_stopping_rounds=200)
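                # Depending on the xgboost version, predictions after early
                # stopping may or may not be truncated to the best iteration
                # automatically; passing ntree_limit=xgb_gbc.best_ntree_limit
                # makes it explicit.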
                pre_pro = xgb_gbc.predict_proba(test_feature)[:, 1]
                fpr, tpr, threshold = roc_curve(test_target, pre_pro)
                AUC = auc(fpr, tpr)
                pre_pro = [1 if i > 0.5 else 0 for i in pre_pro]
                tn, fp, fn, tp = confusion_matrix(test_target, pre_pro).ravel()
                # Sn = TP /(TP + FN)  Sp = TN / (TN+FP)
                Sn = tp / (tp + fn)
                Sp = tn / (tn + fp)
                acc = accuracy_score(test_target, pre_pro)
                dmpnn_xgb_scores.append([AUC, Sn, Sp, acc])
                joblib.dump(xgb_gbc, 'external_test/dmpnn_xgb.model')
                xgb_gbc = _make_xgb_classifier()
                xgb_gbc.fit(train_morgan_feature,
                            train_target,
                            eval_set=[(val_morgan_feature, val_target)],
                            eval_metric='auc',
                            early_stopping_rounds=200)
                pre_pro = xgb_gbc.predict_proba(test_morgan_feature)[:, 1]
                fpr, tpr, threshold = roc_curve(test_target, pre_pro)
                AUC = auc(fpr, tpr)
                pre_pro = [1 if i > 0.5 else 0 for i in pre_pro]
                tn, fp, fn, tp = confusion_matrix(test_target, pre_pro).ravel()
                # Sn = TP /(TP + FN)  Sp = TN / (TN+FP)
                Sn = tp / (tp + fn)
                Sp = tn / (tn + fp)
                acc = accuracy_score(test_target, pre_pro)
                morgan_scores.append([AUC, Sn, Sp, acc])
                joblib.dump(xgb_gbc, 'external_test/morgan_xgb.model')
                train_gcn_mor_feature = pd.concat(
                    [train_feature, train_morgan_feature], axis=1)
                val_gcn_mor_feature = pd.concat(
                    [val_feature, val_morgan_feature], axis=1)
                test_gcn_mor_feature = pd.concat(
                    [test_feature, test_morgan_feature], axis=1)
                train_gcn_mor_feature.columns = val_gcn_mor_feature.columns = test_gcn_mor_feature.columns = range(
                    train_gcn_mor_feature.shape[1])
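                # Renumbering the concatenated columns avoids duplicate labels
                # (both inputs carry a default RangeIndex); xgboost requires
                # unique feature names.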
                xgb_gbc = _make_xgb_classifier()
                xgb_gbc.fit(train_gcn_mor_feature,
                            train_target,
                            eval_set=[(val_gcn_mor_feature, val_target)],
                            eval_metric='auc',
                            early_stopping_rounds=200)
                pre_pro = xgb_gbc.predict_proba(test_gcn_mor_feature)[:, 1]
                fpr, tpr, threshold = roc_curve(test_target, pre_pro)
                AUC = auc(fpr, tpr)
                pre_pro = [1 if i > 0.5 else 0 for i in pre_pro]
                tn, fp, fn, tp = confusion_matrix(test_target, pre_pro).ravel()
                Sn = tp / (tp + fn)
                Sp = tn / (tn + fp)
                acc = accuracy_score(test_target, pre_pro)
                dmpnn_morgan_scores.append([AUC, Sn, Sp, acc])
                joblib.dump(xgb_gbc, 'external_test/dmpnn_morgan_xgb.model')
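                # The three saved models can later be reloaded for external
                # evaluation, e.g. (sketch; 'external_features' is a
                # placeholder for a feature matrix built the same way):
                #
                #     clf = joblib.load('external_test/dmpnn_morgan_xgb.model')
                #     probs = clf.predict_proba(external_features)[:, 1]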

            else:
                aucs = []
                for i in range(test_target.shape[1]):
                    # Skip tasks where any split lacks positive labels; ROC
                    # AUC is undefined in that case.
                    if max(val_target[i]) == 0 or max(
                            train_target[i]) == 0 or max(test_target[i]) == 0:
                        continue
                    xgb_gbc = _make_xgb_classifier()
                    xgb_gbc.fit(train_feature,
                                train_target[i],
                                eval_set=[(val_feature, val_target[i])],
                                eval_metric='auc',
                                early_stopping_rounds=100)
                    pre_pro = xgb_gbc.predict_proba(test_feature)[:, 1]
                    fpr, tpr, threshold = roc_curve(test_target[i], pre_pro)
                    AUC = auc(fpr, tpr)
                    if args.metric == "prc-auc":
                        precision, recall, _ = precision_recall_curve(
                            test_target[i], pre_pro)
                        AUC = auc(recall, precision)
                    pre_lab = [1 if p > 0.5 else 0 for p in pre_pro]
                    tn, fp, fn, tp = confusion_matrix(test_target[i],
                                                      pre_lab).ravel()
                    Sn = tp / (tp + fn)
                    Sp = tn / (tn + fp)
                    acc = accuracy_score(test_target[i], pre_lab)
                    aucs.append([AUC, Sn, Sp, acc])
                # Mean over every task's [AUC, Sn, Sp, acc] entries, matching
                # the per-fold metric averaging performed after the loop.
                dmpnn_xgb_scores.append([np.mean(aucs)])
        elif args.dataset_type == 'regression':
            if test_target.shape[1] == 1:
                xgb_gbc = _make_xgb_regressor()
                xgb_gbc.fit(train_feature,
                            train_target,
                            eval_set=[(val_feature, val_target)],
                            eval_metric='rmse',
                            early_stopping_rounds=200)
                y_pred = xgb_gbc.predict(test_feature)
                y_pred = scaler.inverse_transform(y_pred)
                y_test = test_target.astype('float')
                MSE = mean_squared_error(y_test, y_pred)
                RMSE = MSE ** 0.5
                # Note: median_absolute_error is the median, not mean,
                # absolute error, despite the MAE name.
                MAE = median_absolute_error(y_test, y_pred)
                dmpnn_xgb_scores.append([RMSE, MAE])
                joblib.dump(xgb_gbc, 'external_test/dmpnn_xgb.model')
                xgb_gbc = _make_xgb_regressor()
                xgb_gbc.fit(train_morgan_feature,
                            train_target,
                            eval_set=[(val_morgan_feature, val_target)],
                            eval_metric='rmse',
                            early_stopping_rounds=200)
                y_pred = xgb_gbc.predict(test_morgan_feature)
                y_pred = scaler.inverse_transform(y_pred)
                MSE = mean_squared_error(y_test, y_pred)
                RMSE = MSE**0.5
                MAE = median_absolute_error(y_test, y_pred)
                morgan_scores.append([RMSE, MAE])
                joblib.dump(xgb_gbc, 'external_test/morgan_xgb.model')
                train_gcn_mor_feature = pd.concat(
                    [train_feature, train_morgan_feature], axis=1)
                val_gcn_mor_feature = pd.concat(
                    [val_feature, val_morgan_feature], axis=1)
                test_gcn_mor_feature = pd.concat(
                    [test_feature, test_morgan_feature], axis=1)
                train_gcn_mor_feature.columns = val_gcn_mor_feature.columns = test_gcn_mor_feature.columns = range(
                    train_gcn_mor_feature.shape[1])

                xgb_gbc = _make_xgb_regressor()
                xgb_gbc.fit(train_gcn_mor_feature,
                            train_target,
                            eval_set=[(val_gcn_mor_feature, val_target)],
                            eval_metric='rmse',
                            early_stopping_rounds=200)
                y_pred = xgb_gbc.predict(test_gcn_mor_feature)
                y_pred = scaler.inverse_transform(y_pred)
                MSE = mean_squared_error(y_test, y_pred)
                RMSE = MSE**0.5
                MAE = median_absolute_error(y_test, y_pred)
                dmpnn_morgan_scores.append([RMSE, MAE])
                joblib.dump(xgb_gbc, 'external_test/dmpnn_morgan_xgb.model')

            else:
                MAEs = []
                for i in range(test_target.shape[1]):
                    xgb_gbc = _make_xgb_regressor()
                    xgb_gbc.fit(train_feature,
                                train_target[i],
                                eval_set=[(val_feature, val_target[i])],
                                eval_metric='rmse',
                                early_stopping_rounds=200)
                    y_pred = xgb_gbc.predict(test_feature)
                    y_test = test_target[i].astype('float')
                    MSE = mean_squared_error(y_test, y_pred)
                    RMSE = MSE**0.5
                    MAE = median_absolute_error(y_test, y_pred)
                    MAEs.append([MAE, RMSE])
                # Mean over every task's [MAE, RMSE] entries, matching the
                # per-fold metric averaging performed after the loop.
                dmpnn_xgb_scores.append([np.mean(MAEs)])

    dmpnn_scores = np.array(dmpnn_scores)
    # Report scores across folds
    dmpnn_scores = np.nanmean(dmpnn_scores, axis=1)  # average across tasks within each fold
    dmpnn_mean_score, dmpnn_std_score = np.nanmean(dmpnn_scores), np.nanstd(dmpnn_scores)
    print('per-fold dmpnn test =', dmpnn_scores)
    info(f'Overall dmpnn test {args.metric} = {dmpnn_mean_score:.6f} +/- {dmpnn_std_score:.6f}')

    dmpnn_xgb_scores = np.nanmean(dmpnn_xgb_scores, axis=1)  # average the stored metrics within each fold
    dmpnn_xgb_mean_score, dmpnn_xgb_std_score = np.nanmean(dmpnn_xgb_scores), np.nanstd(dmpnn_xgb_scores)
    print('per-fold dmpnn_xgb test =', dmpnn_xgb_scores)
    info(f'Overall dmpnn_xgb test {args.metric} = {dmpnn_xgb_mean_score:.6f} +/- {dmpnn_xgb_std_score:.6f}')

    morgan_scores = np.nanmean(morgan_scores, axis=1)  # average the stored metrics within each fold
    morgan_mean_score, morgan_std_score = np.nanmean(morgan_scores), np.nanstd(morgan_scores)
    print('per-fold morgan test =', morgan_scores)
    info(f'Overall morgan test {args.metric} = {morgan_mean_score:.6f} +/- {morgan_std_score:.6f}')

    dmpnn_morgan_scores = np.nanmean(dmpnn_morgan_scores, axis=1)  # average the stored metrics within each fold
    dmpnn_morgan_mean_score, dmpnn_morgan_std_score = np.nanmean(dmpnn_morgan_scores), np.nanstd(dmpnn_morgan_scores)
    print('per-fold dmpnn_morgan test =', dmpnn_morgan_scores)
    info(f'Overall dmpnn_morgan test {args.metric} = {dmpnn_morgan_mean_score:.6f} +/- {dmpnn_morgan_std_score:.6f}')
    # Only the final fold's trained D-MPNN is returned.
    return model


def cross_validate(args: TrainArgs, logger: Logger = None):
    """k-fold cross validation with an XGBoost hyper-parameter grid search on
    D-MPNN features and Morgan fingerprints."""
    info = logger.info if logger is not None else print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir

    # Run training on different random seeds for each fold
    dmpnn_scores = []
    scores_df = pd.DataFrame()
    for fold_num in range(args.num_folds):
        if args.dataset_type in ('classification', 'regression'):
            suffix = '_c' if args.dataset_type == 'classification' else '_r'
            base = f'molnet_benchmark/molnet_random_{args.protein}{suffix}/seed{fold_num + 1}'
            args.data_path = f'{base}/train.csv'
            args.separate_test_path = f'{base}/val.csv'
            args.separate_val_path = f'{base}/test.csv'
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores, model, scaler, df = run_training(args, logger)
        if args.loss_save:
            # Loss export is disabled in this variant; cross_validate_mechine
            # below writes the per-epoch loss curve instead.
            # df.to_csv(args.protein + '_loss.csv', index=None)
            pass
        dmpnn_scores.append(model_scores)
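        # get_xgboost_feature presumably runs the trained D-MPNN over each
        # split and returns its learned embeddings (the XGBoost inputs), the
        # matching targets and SMILES lists, and the D-MPNN's own test-set
        # predictions.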
        train_target, train_feature, val_target, val_feature, test_target, test_feature,train_smiles,val_smiles,test_smiles,test_preds = get_xgboost_feature(args, logger,model)
        train_target = pd.DataFrame(train_target)
        train_feature = pd.DataFrame(train_feature)
        val_target = pd.DataFrame(val_target)
        val_feature = pd.DataFrame(val_feature)
        test_target = pd.DataFrame(test_target)
        test_feature = pd.DataFrame(test_feature)
        train_morgan_feature = get_morgan_feature(train_smiles)
        val_morgan_feature = get_morgan_feature(val_smiles)
        test_morgan_feature = get_morgan_feature(test_smiles)
        max_depth_numbers = [2, 4, 6, 8, 10]
        learning_rate_numbers = [0.01, 0.05, 0.1, 0.15, 0.2]
        min_child_weight_numbers = [2, 4, 6, 8, 10]
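        # 5 x 5 x 5 = 125 hyper-parameter combinations are swept per feature
        # set by the grid-search helpers below.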
        if args.dataset_type == 'classification':
            if test_target.shape[1]==1:
                scores = xgboost_cv(max_depth_numbers,learning_rate_numbers,min_child_weight_numbers,
                                       train_feature, train_target,val_feature, val_target,test_feature,test_target,
                                       train_morgan_feature,val_morgan_feature,test_morgan_feature,test_preds)
            else:
                scores = xgb_cv_more(max_depth_numbers,learning_rate_numbers,min_child_weight_numbers,
                                       train_feature, train_target,val_feature, val_target,test_feature,test_target,
                                       train_morgan_feature,val_morgan_feature,test_morgan_feature,test_preds)
            scores.columns = ['type','max_depth','learning_rate','min_child_weight','auc','sn','sp','acc']
            scores_df = pd.concat([scores_df,scores])
        elif args.dataset_type == 'regression':
            if test_target.shape[1]==1:
                scores = xgb_regre_cv(max_depth_numbers, learning_rate_numbers, min_child_weight_numbers,
                                         train_feature, train_target, val_feature, val_target, test_feature, test_target,
                                         train_morgan_feature, val_morgan_feature, test_morgan_feature, test_preds, scaler)
            else:
                scores = xgb_regre_more(max_depth_numbers, learning_rate_numbers, min_child_weight_numbers,
                                         train_feature, train_target, val_feature, val_target, test_feature, test_target,
                                         train_morgan_feature, val_morgan_feature, test_morgan_feature, test_preds, scaler)
            scores.columns = ['type', 'max_depth', 'learning_rate', 'min_child_weight', 'RMSE']
            scores_df = pd.concat([scores_df,scores])

    # Average each hyper-parameter configuration's metrics across folds.
    df_groupby = scores_df.groupby(['type', 'max_depth', 'learning_rate', 'min_child_weight']).mean()
    df_groupby.to_csv(args.protein + '_scores.csv')

    return model


def cross_validate_mechine(args: TrainArgs, logger: Logger = None):
    """k-fold cross validation"""
    info = logger.info if logger is not None else print

    # Initialize relevant variables
    init_seed = args.seed
    save_dir = args.save_dir

    # Run training on different random seeds for each fold
    dmpnn_scores = []
    scores_df = pd.DataFrame()  # accumulates every fold's scores
    for fold_num in range(args.num_folds):
        if args.dataset_type in ('classification', 'regression'):
            suffix = '_c' if args.dataset_type == 'classification' else '_r'
            base = f'molnet_benchmark/molnet_random_{args.protein}{suffix}/seed{fold_num + 1}'
            args.data_path = f'{base}/train.csv'
            args.separate_test_path = f'{base}/val.csv'
            args.separate_val_path = f'{base}/test.csv'
        info(f'Fold {fold_num}')
        args.seed = init_seed + fold_num
        args.save_dir = os.path.join(save_dir, f'fold_{fold_num}')
        makedirs(args.save_dir)
        model_scores, model, scaler, df = run_training(args, logger)
        if args.loss_save:
            # Save the per-epoch loss curve and stop early.
            df.to_csv('/home/cxw/python_work/paper_gcn/dmpnn_epoch_loss/' + args.protein + '_loss.csv', index=None)
            return
        dmpnn_scores.append(model_scores)
        train_target, train_feature, val_target, val_feature, test_target, test_feature,train_smiles,val_smiles,test_smiles,test_preds = get_xgboost_feature(args, logger,model)
        train_target = pd.DataFrame(train_target)
        train_feature = pd.DataFrame(train_feature)
        val_target = pd.DataFrame(val_target)
        val_feature = pd.DataFrame(val_feature)
        test_target = pd.DataFrame(test_target)
        test_feature = pd.DataFrame(test_feature)
        train_morgan_feature = get_morgan_feature(train_smiles)
        val_morgan_feature = get_morgan_feature(val_smiles)
        test_morgan_feature = get_morgan_feature(test_smiles)
        if args.dataset_type == 'classification':
            if test_target.shape[1] == 1:
                scores = svm_knn_rf_class(train_feature, train_target, val_feature, val_target, test_feature, test_target,
                     train_morgan_feature, val_morgan_feature, test_morgan_feature, test_preds)
            else:
                scores = svm_knn_rf_class_more(train_feature, train_target, val_feature, val_target, test_feature, test_target,
                     train_morgan_feature, val_morgan_feature, test_morgan_feature, test_preds)
            scores.columns = ['type', 'auc']
            scores_df = pd.concat([scores_df, scores])
        elif args.dataset_type == 'regression':
            if test_target.shape[1] == 1:
                scores = svm_knn_rf_regre(train_feature, train_target, val_feature, val_target, test_feature, test_target,
                     train_morgan_feature, val_morgan_feature, test_morgan_feature, test_preds)
            else:
                scores = svm_knn_rf_regre_more(train_feature, train_target, val_feature, val_target, test_feature, test_target,
                     train_morgan_feature, val_morgan_feature, test_morgan_feature, test_preds)
            scores.columns = ['type', 'RMSE']
            scores_df = pd.concat([scores_df, scores])

    # Persist scores from all folds, not just the last one.
    scores_df.to_csv(args.protein + '_machine_scores.csv')
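

# A minimal driver sketch, assuming a chemprop-style TrainArgs extended with
# the extra fields this project reads (protein, loss_save, num_folds, ...):
#
#     if __name__ == '__main__':
#         args = TrainArgs().parse_args()
#         cross_validate_mechine(args)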