def run_cv():
    """Compare mixture, per-race and supervised-transfer models on simulated data.

    Builds a simulated dataset with ``SimulatedData``, prints its shape and
    the value counts of the stratification / race / outcome labels, then for
    seeds 0..19 runs each experiment with ``fold=3`` and joins the resulting
    columns (mixture frame, ``W_ind``, ``B_ind``, ``TL_Auc``) side by side.
    Each per-seed frame is printed, followed by the stacked result.

    Returns:
        None. Results are only printed.
    """
    factory = SimulatedData(
        num_var=200,
        pos=[0, 0, 0, 0],
        neg=[0, 0, 0, 0],
        neut=[100, 0, 0, 100],
    )

    dataset = factory.generate_black_white_data(
        'PanGyn-DFI-5-base.mat',
        white_alive=130, white_dead=130, black_alive=130, black_dead=130)
    dataset_w = get_one_race_clf(dataset, 'WHITE')
    dataset_b = get_one_race_clf(dataset, 'BLACK')
    # NOTE(review): dataset_tl (first element normalized) is built but never
    # passed to any experiment below; the transfer run uses `dataset`.
    # Kept as-is so behavior is unchanged -- confirm which one is intended.
    dataset_tl = list(dataset)
    dataset_tl[0] = preprocessing.normalize(dataset_tl[0])

    k = -1
    X, Y, R, y_sub, y_strat = dataset
    df = pd.DataFrame(y_strat, columns=['RY'])
    df['R'] = R
    df['Y'] = Y
    print(X.shape)
    print(df['RY'].value_counts())
    print(df['R'].value_counts())
    print(df['Y'].value_counts())

    # Settings common to every experiment; each dict below gets its own
    # hiddenLayers list so no mutable object is shared between runs.
    shared = {'fold': 3, 'k': k, 'val_size': 0.0,
              'learning_rate': 0.01, 'lr_decay': 0.0, 'dropout': 0.5,
              'L1_reg': 0.001, 'L2_reg': 0.001}
    parametrs_mix = dict(shared, batch_size=20, momentum=0.9,
                         hiddenLayers=[128, 64])
    parametrs_w = dict(shared, batch_size=20, hiddenLayers=[128, 64])
    parametrs_b = dict(shared, batch_size=20, hiddenLayers=[128, 64])
    parametrs_tl = dict(shared, batch_size=32, train_epoch=100, tune_epoch=100,
                        tune_lr=0.001, tune_batch=32, hiddenLayers=[128, 64])

    # Collect per-seed frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    per_seed = []
    for seed in range(20):
        df_m = run_mixture_cv(seed, dataset, **parametrs_mix)
        df_w = run_one_race_cv(seed, dataset_w, **parametrs_w)
        df_w = df_w.rename(columns={"Auc": "W_ind"})
        df_b = run_one_race_cv(seed, dataset_b, **parametrs_b)
        df_b = df_b.rename(columns={"Auc": "B_ind"})
        df_tl = run_supervised_transfer_cv(seed, dataset, **parametrs_tl)
        df1 = pd.concat([df_m, df_w['W_ind'], df_b['B_ind'], df_tl['TL_Auc']],
                        sort=False, axis=1)
        print(df1)
        per_seed.append(df1)

    res = pd.concat(per_seed)
    print(res)
# Example #2
0
def run_cv(cancer_type, feature_type, target, years=3, groups=("WHITE", "BLACK")):
    """Run 20 seeded comparisons for one cancer/feature/target and save to Excel.

    Args:
        cancer_type: Passed to ``get_dataset`` and used in the output file name.
        feature_type: Passed to ``get_dataset``; its first element is used in
            the output file name, and ``k`` is 200 when it contains ``'mRNA'``
            (otherwise -1).
        target: Passed to ``get_dataset`` and used in the output file name.
        years: Passed to ``get_n_years`` and used in the output file name.
        groups: Pair of group labels. ``groups[0]`` / ``groups[1]`` select the
            two single-group datasets, and the pair is forwarded to
            ``get_dataset``, ``run_mixture_cv`` and
            ``run_supervised_transfer_cv``.

    Returns:
        None. The stacked per-seed results are written to
        ``Result/<cancer_type>-AA-EA-<feature_type[0]>-<target>-<years>YR.xlsx``.
    """
    print(cancer_type, feature_type, target, years)
    # Honour the caller's `groups` here too; the original hard-coded
    # ("WHITE", "BLACK") while every later step used `groups`.
    dataset = get_dataset(cancer_type=cancer_type, feature_type=feature_type,
                          target=target, groups=groups)
    dataset = standarize_dataset(dataset)
    dataset_w = get_n_years(get_one_race(dataset, groups[0]), years)
    dataset_b = get_n_years(get_one_race(dataset, groups[1]), years)
    dataset = get_n_years(dataset, years)

    k = 200 if 'mRNA' in feature_type else -1
    _, _, _, _, y_strat = dataset
    strata_counts = dict(pd.DataFrame(y_strat, columns=['RY'])['RY'].value_counts())
    print(strata_counts)

    # Settings common to every experiment; each dict gets a fresh
    # hiddenLayers list so nothing mutable is shared between runs.
    shared = {'fold': 3, 'k': k, 'val_size': 0.0,
              'learning_rate': 0.01, 'dropout': 0.5,
              'L1_reg': 0.001, 'L2_reg': 0.001}
    parametrs_mix = dict(shared, batch_size=20, momentum=0.9, lr_decay=0.03,
                         hiddenLayers=[128, 64])
    parametrs_w = dict(shared, batch_size=20, lr_decay=0.0,
                       hiddenLayers=[128, 64])
    parametrs_b = dict(shared, batch_size=4, lr_decay=0.0,
                       hiddenLayers=[128, 64])
    parametrs_tl = dict(shared, batch_size=32, tune_epoch=100, train_epoch=100,
                        lr_decay=0.0, tune_lr=0.01, tune_batch=10,
                        hiddenLayers=[128, 64])

    # Collect per-seed frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    per_seed = []
    for seed in range(20):
        df_m = run_mixture_cv(seed, dataset, **parametrs_mix, groups=groups)
        df_w = run_one_race_cv(seed, dataset_w, **parametrs_w)
        df_w = df_w.rename(columns={"Auc": "W_ind"})
        df_b = run_one_race_cv(seed, dataset_b, **parametrs_b)
        df_b = df_b.rename(columns={"Auc": "A_ind"})
        df_tl = run_supervised_transfer_cv(seed, dataset, **parametrs_tl, groups=groups)
        df1 = pd.concat([df_m, df_w['W_ind'], df_b['A_ind'], df_tl['TL_Auc']],
                        sort=False, axis=1)

        print(df1)
        per_seed.append(df1)

    res = pd.concat(per_seed)
    f_name = 'Result/{}-AA-EA-{}-{}-{}YR.xlsx'.format(
        cancer_type, feature_type[0], target, str(years))
    res.to_excel(f_name)
# Example #3
0
def run_cv():
    """Run 20 seeded comparisons on the MMRF mRNA OS data and save to Excel.

    Loads the data with ``read_data('MMRF', 'mRNA', 'OS', 3)``, derives the
    WHITE-only and BLACK-only datasets, prints the shape and label value
    counts, then for seeds 0..19 runs the mixture, per-race and unsupervised
    transfer experiments with ``fold=3`` and joins the resulting columns
    (mixture frame, ``W_ind``, ``B_ind``, ``TL_Auc``) side by side.

    Returns:
        None. The stacked per-seed results are written to
        ``Result/MM-AA-EA-mRNA-OS-3YR.xlsx``.
    """
    dataset = read_data('MMRF', 'mRNA', 'OS', 3)
    dataset_w = get_n_years(get_one_race(dataset, 'WHITE'), 3)
    dataset_b = get_n_years(get_one_race(dataset, 'BLACK'), 3)
    dataset = get_n_years(dataset, 3)

    X, Y, R, y_sub, y_strat = dataset
    df = pd.DataFrame(y_strat, columns=['RY'])
    df['R'] = R
    df['Y'] = Y
    print(X.shape)
    print(df['RY'].value_counts())
    print(df['R'].value_counts())
    print(df['Y'].value_counts())

    k = -1
    # Only the BLACK-only run overrides the helpers' defaults; the mixture,
    # WHITE-only and transfer runs are called with just fold=3.
    parametrs_b = {
        'fold': 3,
        'k': k,
        'val_size': 0.0,
        'batch_size': 4,
        'learning_rate': 0.01,
        'lr_decay': 0.0,
        'dropout': 0.5,
        'L1_reg': 0.001,
        'L2_reg': 0.001,
        'hiddenLayers': [128, 64],
    }

    # Collect per-seed frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    per_seed = []
    for seed in range(20):
        df_m = run_mixture_cv(seed, dataset, fold=3)
        df_w = run_one_race_cv(seed, dataset_w, fold=3)
        df_w = df_w.rename(columns={"Auc": "W_ind"})
        df_b = run_one_race_cv(seed, dataset_b, **parametrs_b)
        df_b = df_b.rename(columns={"Auc": "B_ind"})
        df_tl = run_unsupervised_transfer_cv(seed, dataset, fold=3)

        df1 = pd.concat([df_m, df_w['W_ind'], df_b['B_ind'], df_tl['TL_Auc']],
                        sort=False,
                        axis=1)
        print(df1)
        per_seed.append(df1)

    res = pd.concat(per_seed)
    f_name = 'Result/MM-AA-EA-mRNA-OS-3YR.xlsx'
    res.to_excel(f_name)
# Example #4
0
def run_cv(cancer_type, feature_type, target, years=3):
    """Run 20 seeded comparisons including CCSA transfer and save to Excel.

    Args:
        cancer_type: Passed to ``get_dataset`` and used in the output file name.
        feature_type: Passed to ``get_dataset``; its first element is used in
            the output file name.
        target: Passed to ``get_dataset`` and used in the output file name.
        years: Passed to ``get_n_years`` and used in the output file name.

    Returns:
        None. The stacked per-seed results are written to
        ``Result/<cancer_type>-AA-EA-<feature_type[0]>-<target>-<years>YR.xlsx``.
    """
    print(cancer_type, feature_type, target, years)
    dataset = get_dataset(cancer_type=cancer_type,
                          feature_type=feature_type,
                          target=target,
                          groups=("WHITE", "BLACK"))
    dataset_w = get_n_years(get_one_race(dataset, 'WHITE'), years)
    dataset_b = get_n_years(get_one_race(dataset, 'BLACK'), years)

    # The CCSA run gets its own copy produced by normalize_dataset; the
    # other experiments use the dataset as loaded.
    dataset_tl = get_n_years(normalize_dataset(dataset), years)
    dataset = get_n_years(dataset, years)

    k = -1
    X, Y, R, y_sub, y_strat = dataset
    df = pd.DataFrame(y_strat, columns=['RY'])
    df['R'] = R
    df['Y'] = Y
    print(X.shape)
    print(df['RY'].value_counts())
    print(df['R'].value_counts())
    print(df['Y'].value_counts())

    # Settings common to the mixture / per-race runs; each dict gets a fresh
    # hiddenLayers list so nothing mutable is shared between runs.
    shared = {
        'fold': 3,
        'k': k,
        'val_size': 0.0,
        'learning_rate': 0.01,
        'lr_decay': 0.0,
        'dropout': 0.5,
        'L1_reg': 0.001,
        'L2_reg': 0.001,
    }
    parametrs_mix = dict(shared, batch_size=20, hiddenLayers=[128, 64])
    parametrs_w = dict(shared, batch_size=20, hiddenLayers=[128, 64])
    parametrs_b = dict(shared, batch_size=4, hiddenLayers=[128, 64])
    parameters_CCSA = {
        'fold': 3,
        'n_features': k,
        'alpha': 0.3,
        'batch_size': 20,
        'learning_rate': 0.01,
        'hiddenLayers': [100],
        'dr': 0.0,
        'momentum': 0.9,
        'decay': 0.0,
        'sample_per_class': 2,
    }

    # Collect per-seed frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    per_seed = []
    for seed in range(20):
        df_m = run_mixture_cv(seed, dataset, **parametrs_mix)
        df_w = run_one_race_cv(seed, dataset_w, **parametrs_w)
        df_w = df_w.rename(columns={"Auc": "W_ind"})
        df_b = run_one_race_cv(seed, dataset_b, **parametrs_b)
        df_b = df_b.rename(columns={"Auc": "B_ind"})
        df_tl = run_CCSA_transfer(seed, dataset_tl, **parameters_CCSA)

        df1 = pd.concat([df_m, df_w['W_ind'], df_b['B_ind'], df_tl['TL_Auc']],
                        sort=False,
                        axis=1)
        print(df1)
        per_seed.append(df1)

    res = pd.concat(per_seed)
    f_name = 'Result/{}-AA-EA-{}-{}-{}YR.xlsx'.format(
        cancer_type, feature_type[0], target, str(years))
    res.to_excel(f_name)
# Example #5
0
def run_cv(cancer_type,
           feature_type,
           target,
           years=3,
           groups=("WHITE", "BLACK")):
    """Run 20 seeded comparisons of mixture, per-race and transfer models.

    Args:
        cancer_type: Passed to ``get_dataset``; stored in the result's
            ``cancer_type`` column.
        feature_type: Passed to ``get_dataset``; ``k`` is 200 when it contains
            ``'mRNA'`` or ``'methylation'`` (otherwise -1). Joined with ``'-'``
            for the result's ``feature_type`` column.
        target: Passed to ``get_dataset``; stored in the ``target`` column.
        years: Passed to ``get_n_years``; stored in the ``years`` column.
        groups: Forwarded to ``get_dataset`` only.

    Returns:
        The stacked per-seed DataFrame (mixture frame plus ``W_ind``,
        ``B_ind``, ``CCSA_TL`` and ``XY_TL`` columns) with the four metadata
        columns added, or ``None`` when the data is too small: fewer than 10
        rows overall, fewer than 5 rows for either race, fewer than 4 distinct
        ``RY`` strata, or any stratum with fewer than 5 samples.
    """
    print(cancer_type, feature_type, target, years)
    dataset = get_dataset(cancer_type=cancer_type,
                          feature_type=feature_type,
                          target=target,
                          groups=groups)
    if dataset['X'].shape[0] < 10:
        return None
    dataset = standarize_dataset(dataset)

    # NOTE(review): the race labels are hard-coded here although `groups` is
    # a parameter; with non-default groups these subsets are presumably empty
    # and the function returns None. Left unchanged because the CV helpers
    # below are not passed `groups` either -- confirm intended behavior.
    dataset_w = get_one_race(dataset, 'WHITE')
    if dataset_w['X'].shape[0] < 5:
        return None
    dataset_w = get_n_years(dataset_w, years)
    dataset_b = get_one_race(dataset, 'BLACK')
    if dataset_b['X'].shape[0] < 5:
        return None
    dataset_b = get_n_years(dataset_b, years)

    # The CCSA run gets its own copy produced by normalize_dataset.
    dataset_tl = get_n_years(normalize_dataset(dataset), years)

    dataset = get_n_years(dataset, years)
    k = 200 if 'mRNA' in feature_type or 'methylation' in feature_type else -1

    X, Y, R, y_sub, y_strat = dataset
    df = pd.DataFrame(y_strat, columns=['RY'])
    df['R'] = R
    df['Y'] = Y
    print(X.shape)
    strata = df['RY'].value_counts()
    print(strata)
    if len(strata) < 4:
        return None
    strata = dict(strata)
    print(strata)
    for key in strata:
        print(key, strata[key])
        if strata[key] < 5:
            return None

    # Settings common to the non-CCSA experiments; each dict gets a fresh
    # hiddenLayers list so nothing mutable is shared between runs.
    shared = {
        'fold': 3,
        'k': k,
        'val_size': 0.0,
        'L1_reg': 0.001,
        'L2_reg': 0.001,
    }
    parametrs_mix = dict(shared, batch_size=20, momentum=0.9,
                         learning_rate=0.01, lr_decay=0.03, dropout=0.5,
                         hiddenLayers=[128, 64])
    parametrs_w = dict(shared, batch_size=20,
                       learning_rate=0.01, lr_decay=0.0, dropout=0.5,
                       hiddenLayers=[128, 64])
    parametrs_b = dict(shared, batch_size=4,
                       learning_rate=0.01, lr_decay=0.0, dropout=0.5,
                       hiddenLayers=[128, 64])
    parametrs_tl = dict(shared, batch_size=20, tune_epoch=500,
                        learning_rate=0.01, lr_decay=0.03, dropout=0.5,
                        tune_lr=0.002, tune_batch=10,
                        hiddenLayers=[128, 64])
    parametrs_tl_unsupervised = dict(shared, batch_size=20,
                                     learning_rate=0.001, lr_decay=0.03,
                                     dropout=0.0, n_epochs=100,
                                     hiddenLayers=[100])
    parameters_CCSA = {
        'fold': 3,
        'n_features': k,
        'alpha': 0.3,
        'batch_size': 32,
        'learning_rate': 0.01,
        'hiddenLayers': [100],
        'dr': 0.0,
        'momentum': 0.0,
        'decay': 0.0,
        'sample_per_class': 2,
    }

    # Collect per-seed frames and concatenate once: DataFrame.append was
    # removed in pandas 2.0 and re-copied the whole frame on every call.
    per_seed = []
    for seed in range(20):
        df_m = run_mixture_cv(seed, dataset, **parametrs_mix)
        df_w = run_one_race_cv(seed, dataset_w, **parametrs_w)
        df_w = df_w.rename(columns={"Auc": "W_ind"})
        df_b = run_one_race_cv(seed, dataset_b, **parametrs_b)
        df_b = df_b.rename(columns={"Auc": "B_ind"})
        df_tl_supervised = run_supervised_transfer_cv(seed, dataset,
                                                      **parametrs_tl)
        df_tl_supervised = df_tl_supervised.rename(columns={"TL_Auc": "XY_TL"})

        # NOTE(review): this result is computed but not joined into df1
        # below. The call is kept so execution order is unchanged; drop it
        # or add the 'X_TL' column once the intent is confirmed.
        df_tl_unsupervised = run_unsupervised_transfer_cv(
            seed, dataset, **parametrs_tl_unsupervised)
        df_tl_unsupervised = df_tl_unsupervised.rename(
            columns={"TL_Auc": "X_TL"})

        df_tl = run_CCSA_transfer(seed, dataset_tl, **parameters_CCSA)
        df_tl = df_tl.rename(columns={"TL_Auc": "CCSA_TL"})

        df1 = pd.concat(
            [
                df_m,
                df_w['W_ind'],
                df_b['B_ind'],
                df_tl['CCSA_TL'],
                df_tl_supervised['XY_TL'],
            ],
            sort=False,
            axis=1)

        per_seed.append(df1)

    res = pd.concat(per_seed)
    print(res)
    res['cancer_type'] = cancer_type
    res['feature_type'] = '-'.join(feature_type)
    res['target'] = target
    res['years'] = years
    return res