Code Example #1
def random_select_subset(args):
    import h5py
    import numpy as np
    from utils import prepare_output_file

    if ':' not in args.input_file:
        raise ValueError('missing group name in input file: ' +
                         args.input_file)
    logger.info('read input file: ' + args.input_file)
    input_file, group_name = args.input_file.split(':')
    with h5py.File(input_file, 'r') as f:
        X = f['/%s/X' % group_name][:]
        chrom = f['/%s/chrom' % group_name][:]
        positions = f['/%s/position' % group_name][:]

    logger.info('select %d SNP subsets of size %d' %
                (args.n_groups, args.n_snps))
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        if args.method == 'random_choice':
            for i in range(args.n_groups):
                ind = np.random.choice(X.shape[0],
                                       size=args.n_snps,
                                       replace=False)
                f.create_dataset('/%d/X' % i, data=X[ind])
                f.create_dataset('/%d/chrom' % i, data=chrom[ind])
                f.create_dataset('/%d/position' % i, data=positions[ind])
        elif args.method == 'seq':
            for i in range(X.shape[0] // args.n_snps):  # integer division (Python 3)
                ind = np.r_[(i * args.n_snps):((i + 1) * args.n_snps)]
                f.create_dataset('/%d/X' % i, data=X[ind])
                f.create_dataset('/%d/chrom' % i, data=chrom[ind])
                f.create_dataset('/%d/position' % i, data=positions[ind])
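
A minimal invocation sketch for the function above (file names are hypothetical; it assumes the module-level logger and the utils helpers that all of these snippets rely on):

import argparse

args = argparse.Namespace(
    input_file='genotypes.h5:train',   # hypothetical path; the group name follows ':'
    n_groups=10,                       # number of SNP subsets to draw
    n_snps=1000,                       # SNPs per subset
    method='random_choice',            # or 'seq'
    output_file='snp_subsets.h5')      # hypothetical path
random_select_subset(args)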
Code Example #2
File: predict.py  Project: ltbyshi/cardiacai
def classify_diseases(args):
    from keras.models import load_model
    from models import custom_objects
    from utils import read_hdf5, array_lookup, prepare_output_file
    import numpy as np
    import h5py
    globals().update(locals())  # push these local imports into module globals (hack kept from the original)

    logger.info('load model from file: ' + args.model_file)
    model = load_model(args.model_file, custom_objects=custom_objects)

    logger.info('read image ids from file: ' + args.image_id_file)
    image_id = read_hdf5(args.image_id_file, args.image_id_dataset)

    logger.info('read image data from file: ' + args.input_file)
    fin = h5py.File(args.input_file, 'r')
    X = fin['X'][:]
    image_id_X = fin['image_id'][:]
    fin.close()
    if model.input.shape[3] > 1:
        logger.info('convert gray-scale images to 3-channel images')
        X = np.repeat(X[array_lookup(image_id_X, image_id)], 3, axis=3)
    else:
        X = np.take(X, array_lookup(image_id_X, image_id), axis=0)

    logger.info('predict')
    y = model.predict(X, batch_size=args.batch_size)

    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    fout = h5py.File(args.output_file, 'w')
    fout.create_dataset('y', data=y)
    fout.create_dataset('image_id', data=image_id)
    fout.close()
Code Example #3
def random_cv_split(args):
    import numpy as np
    import h5py
    from utils import read_hdf5_single, cv_split_emaize, get_indices_table, prepare_output_file, read_hdf5_dataset

    logger.info('read training indices file: ' + args.train_index_file)
    train_indices_all = read_hdf5_dataset(args.train_index_file)
    logger.info('read parent table file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    indices_table, mask = get_indices_table(train_indices_all, parent_table)

    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        for k in range(args.n_datasets):
            row_indices = np.random.choice(indices_table.shape[0],
                                           5,
                                           replace=False)
            col_indices = np.random.choice(indices_table.shape[1],
                                           5,
                                           replace=False)
            test_indices = np.union1d(
                indices_table[row_indices, :].reshape((-1, )),
                indices_table[:, col_indices].reshape((-1, )))
            train_indices = np.setdiff1d(train_indices_all, test_indices)
            test_indices = np.intersect1d(test_indices, train_indices_all)
            train_indices = np.intersect1d(train_indices, train_indices_all)
            g = f.create_group(str(k))
            g.create_dataset('train', data=train_indices)
            g.create_dataset('test', data=test_indices)
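
The output file holds one numbered group per split, each with a 'train' and a 'test' dataset; a minimal sketch for reading the splits back (hypothetical path):

import h5py
with h5py.File('cv_splits.h5', 'r') as f:
    for k in sorted(f.keys(), key=int):
        train_indices = f[k]['train'][:]
        test_indices = f[k]['test'][:]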
Code Example #4
def plot_model(args):
    import keras
    from utils import prepare_output_file

    logger.info('load model from file: ' + args.model_file)
    model = keras.models.load_model(args.model_file)
    logger.info('save plot model file: ' + args.output_file)
    prepare_output_file(args.output_file)
    keras.utils.plot_model(model, args.output_file, show_shapes=True)
Code Example #5
def run_generate_parent_table(args):
    from utils import generate_parent_table, prepare_output_file
    import numpy as np
    import h5py

    logger.info('read phenotypes from file: ' + args.input_file)
    parent_table = generate_parent_table(args.input_file)
    logger.info('save parent table to file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('data', data=parent_table)
Code Example #6
def random_select(args):
    import os
    import h5py
    import numpy as np
    from utils import prepare_output_file

    logger.info('read genomic positions from file: ' + args.genomic_pos_file)
    positions = {}
    with h5py.File(args.genomic_pos_file, 'r') as f:
        for i in range(1, 11):
            positions['chr%d' % i] = f['chr%d' % i][:]
    n_snps_per_chrom = {
        chrom: positions[chrom].shape[0]
        for chrom in positions.keys()
    }
    n_snps_total = sum(n_snps_per_chrom.values())

    X = [[] for i in range(args.n_select)]
    chroms = [[] for i in range(args.n_select)]
    positions_sel = [[] for i in range(args.n_select)]
    n_sel_total = 0
    for i_chrom in range(1, 11):
        chrom = 'chr%d' % i_chrom
        genotype_file = os.path.join(args.input_dir, chrom)
        logger.info('read genotype file: ' + genotype_file)
        with h5py.File(genotype_file, 'r') as f:
            n_sel = int(
                np.round(args.n_snps * float(n_snps_per_chrom[chrom]) /
                         n_snps_total))
            logger.info('select %d SNPs on chromosome %d' % (n_sel, i_chrom))
            X_chrom = f['data'][:]
            for i_select in range(args.n_select):
                ind = np.random.choice(n_snps_per_chrom[chrom],
                                       size=n_sel,
                                       replace=False)
                X[i_select].append(X_chrom[ind])
                chroms[i_select].append(np.full(n_sel, i_chrom, dtype='int8'))
                positions_sel[i_select].append(positions[chrom][ind])
            del X_chrom
            n_sel_total += n_sel
    logger.info('number of SNPs selected: %d' % n_sel_total)

    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    fout = h5py.File(args.output_file, 'w')
    for i_select in range(args.n_select):
        X[i_select] = np.concatenate(X[i_select], axis=0)
        chroms[i_select] = np.concatenate(chroms[i_select])
        positions_sel[i_select] = np.concatenate(positions_sel[i_select])
        g = fout.create_group(str(i_select))
        g.create_dataset('X', data=X[i_select])
        g.create_dataset('chrom', data=chroms[i_select])
        g.create_dataset('position', data=positions_sel[i_select])
    fout.close()
Code Example #7
def phenotypes_to_hdf5(args):
    import pandas as pd
    import h5py
    from utils import prepare_output_file

    logger.info('read phenotype file: ' + args.input_file)
    phenotypes = pd.read_table(args.input_file)
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        for col in phenotypes.columns:
            if phenotypes[col].dtype == 'O':
                f.create_dataset(col, data=phenotypes[col].values.astype('S'))
            else:
                f.create_dataset(col, data=phenotypes[col].values)
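
h5py cannot store pandas object (string) columns directly, hence the cast to fixed-width bytes ('S'); reading them back requires decoding. A sketch, assuming a string column named 'id' as used in the other snippets (hypothetical path):

import h5py
with h5py.File('phenotypes.h5', 'r') as f:
    ids = f['id'][:].astype('U')  # bytes back to unicode strings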
Code Example #8
def convert_train_test_indices(args):
    import numpy as np
    import h5py
    from utils import prepare_output_file

    logger.info('read training sample indices from file: ' +
                args.train_index_file)
    train_index = np.loadtxt(args.train_index_file, dtype='int')
    logger.info('read test sample indices from file: ' +
                args.test_index_file)
    test_index = np.loadtxt(args.test_index_file, dtype='int')
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('train', data=train_index)
        f.create_dataset('test', data=test_index)
Code Example #9
def phenotypes_to_train_test_indices(args):
    import pandas as pd
    import numpy as np
    import h5py
    from utils import prepare_output_file

    logger.info('read input file: ' + args.input_file)
    phenotypes = pd.read_table(args.input_file)
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('train',
                         data=np.nonzero(
                             (phenotypes['type'] == 'training').values)[0])
        f.create_dataset('test',
                         data=np.nonzero(
                             (phenotypes['type'] == 'test').values)[0])
Code Example #10
def normalize_genotypes(args):
    from utils import read_hdf5_dataset, prepare_output_file
    import numpy as np
    import h5py

    logger.info('read input file: ' + args.input_file)
    X, dataset = read_hdf5_dataset(args.input_file, return_name=True)
    n_snps, n_samples = X.shape
    # per-SNP mean of X across samples, used as the allele frequency p below
    p = X.sum(axis=1).astype('float32')/n_samples
    multiplier = 1.0/np.sqrt(2.0*p*(1.0 - p))
    multiplier = multiplier.astype('float32')
    logger.info('save mean and multipliers to output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('mean', data=p)
        f.create_dataset('multiplier', data=multiplier)
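
The saved 'mean' and 'multiplier' are presumably applied downstream to standardize each SNP row; a sketch of that use, under the assumption that X keeps the same SNPs-by-samples layout as above (hypothetical path):

import h5py
import numpy as np
with h5py.File('normalize_params.h5', 'r') as f:
    mean = f['mean'][:]
    multiplier = f['multiplier'][:]
X_std = (X - mean[:, np.newaxis]) * multiplier[:, np.newaxis]  # row-wise standardization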
Code Example #11
def extract_snp_pos(args):
    import os
    import h5py
    import subprocess
    import numpy as np
    from utils import prepare_output_file

    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    fout = h5py.File(args.output_file, 'w')
    for i in range(1, 11):
        genotype_file = os.path.join(args.input_dir,
                                     'chr%d_emaize.genoMat' % i)
        logger.info('read genotype matrix file: ' + genotype_file)
        # awk 'NR>1{print $4}': skip the header line and print column 4 (the SNP position)
        p = subprocess.Popen(['awk', 'NR>1{print $4}', genotype_file],
                             stdout=subprocess.PIPE)
        positions = np.loadtxt(p.stdout, dtype='int64')
        fout.create_dataset('chr%d' % i, data=positions)
    fout.close()
Code Example #12
def create_gsm(args):
    import h5py
    import numpy as np
    from utils import prepare_output_file, read_hdf5_dataset
    # --- disabled alternative kept from the original: random SNP selection per chromosome ---
    # logger.info('read genomic positions from file: ' + args.genomic_pos_file)
    # positions = {}
    # with h5py.File(args.genomic_pos_file, 'r') as f:
    #     for i in range(1, 11):
    #         positions['chr%d'%i] = f['chr%d'%i][:]
    # n_snps_per_chrom = {chrom:positions[chrom].shape[0] for chrom in positions.keys()}
    # n_snps_total = sum(n_snps_per_chrom.values())
    # X = []
    # for chrom in positions.keys():
    #     genotype_file = os.path.join(args.input_dir, chrom)
    #     logger.info('read genotype file: ' + genotype_file)
    #     with h5py.File(genotype_file, 'r') as f:
    #         n_sel = int(np.round(args.n_snps*float(n_snps_per_chrom[chrom])/n_snps_total))
    #         ind = np.random.choice(n_snps_per_chrom[chrom], size=n_sel)
    #         X.append(f['data'][:][ind])
    # X = np.concatenate(X, axis=0).astype('float32')
    logger.info('read genotypes from file: ' + args.input_file)
    X = read_hdf5_dataset(args.input_file).astype('float64')
    logger.info('number of selected SNPs: %d' % X.shape[0])
    logger.info('calculate GSM')
    X -= X.mean(axis=1)[:, np.newaxis]
    X_std = np.sqrt(np.sum(X**2, axis=1))
    X_std[np.isclose(X_std, 0.0)] = 1.0
    X = X / X_std[:, np.newaxis]
    logger.info('calculate K')
    K = np.dot(X.T, X)
    logger.info('run SVD on X')
    U, S, V = np.linalg.svd(X.T, full_matrices=False)
    V = V.T
    logger.info('save GSM to file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset('K', data=K)
        f.create_dataset('U', data=U)
        f.create_dataset('S', data=S)
        f.create_dataset('V', data=V)
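
Because K = XᵀX and the code factorizes Xᵀ = U·diag(S)·Vᵀ, the stored matrices satisfy K = U·diag(S²)·Uᵀ; a quick consistency check on the saved factors (hypothetical path):

import h5py
import numpy as np
with h5py.File('gsm.h5', 'r') as f:
    K, U, S = f['K'][:], f['U'][:], f['S'][:]
assert np.allclose(K, (U * S**2).dot(U.T))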
Code Example #13
def convert_2bit_to_minor(args):
    from utils import read_hdf5_dataset, prepare_output_file
    import numpy as np
    import h5py
    import numba

    @numba.jit(nopython=True)
    def _2bit_to_minor(X_2bit, X_minor):
        n_snps = X_minor.shape[0]
        n_samples = X_minor.shape[1]
        max_freq = n_samples
        for i in range(n_snps):
            freq = 0
            for j in range(n_samples):
                count = X_2bit[i, 1, j] - X_2bit[i, 0, j] + 1
                freq += count
                X_minor[i, j] = count
            if freq > n_samples:
                for j in range(n_samples):
                    X_minor[i, j] = 2 - X_minor[i, j]

    logger.info('read input file: ' + args.input_file)
    X_2bit, dataset = read_hdf5_dataset(args.input_file, return_name=True)
    n_snps, n_samples = X_2bit.shape
    n_snps //= 2  # integer division: each SNP occupies two rows in the 2bit encoding
    logger.info('number of SNPs: %d, number of samples: %d' % (n_snps, n_samples))
    X_2bit = X_2bit.reshape((n_snps, 2, n_samples))
    logger.info('convert from 2bit code to minor copy numbers')
    # assume that the second allele in the 2bit representation is the minor allele
    # 10 -> 0, 11 -> 1, 01 -> 2
    #X_minor = np.einsum('ijk,j->ik', X_2bit, np.array([-1, 1])) + 1
    # swap the two alleles to make sure that the number represents a minor allele
    #allele_freq = np.expand_dims(np.sum(X_2bit[:, 1, :], axis=1), axis=1)
    #X_minor = np.where(allele_freq <= n_samples/2, X_minor, 2 - X_minor)
    X_minor = np.empty((n_snps, n_samples), dtype='int8')
    _2bit_to_minor(X_2bit, X_minor)
    logger.info('save minor allele copy numbers to output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as f:
        f.create_dataset(dataset, data=X_minor)
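
A toy check of the encoding described in the comments (10 -> 0, 11 -> 1, 01 -> 2), using one SNP and three samples and reusing the inner _2bit_to_minor helper (hoisted to module level for the demonstration); the minor-allele total (3) does not exceed the sample count, so no flip is applied:

import numpy as np
X_2bit = np.array([[[1, 1, 0],    # first-allele bits
                    [0, 1, 1]]],  # second-allele bits
                  dtype='int8')
X_minor = np.empty((1, 3), dtype='int8')
_2bit_to_minor(X_2bit, X_minor)
print(X_minor)  # [[0 1 2]]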
Code Example #14
def analyze_relevance(args):
    import keras
    import numpy as np
    import h5py
    from utils import read_hdf5, array_lookup, prepare_output_file

    logger.info('load model from file: ' + args.model_file)
    model = keras.models.load_model(args.model_file)
    logger.info('read image id file: ' + args.image_id_file)
    image_id = read_hdf5(args.image_id_file, args.image_id_dataset)
    logger.info('read input file: ' + args.input_file)
    X, image_id_X = read_hdf5(args.input_file,
                              [args.input_dataset, 'image_id'])
    if model.input.shape[3] > 1:
        logger.info('convert gray-scale images to 3-channel images')
        X = np.repeat(X[array_lookup(image_id_X, image_id)], 3, axis=3)
    else:
        X = np.take(X, array_lookup(image_id_X, image_id), axis=0)
    logger.info('read target file: ' + args.target_file)
    y, image_id_y = read_hdf5(args.target_file,
                              [args.target_dataset, 'image_id'])
    y = np.take(y, array_lookup(image_id_y, image_id), axis=0)
    if len(y.shape) == 1:
        y = np.expand_dims(y, axis=1)

    if args.method == 'deep_taylor':
        logger.info('input tensor name: ' + model.input.name.split(':')[0])
        logger.info('output tensor name: ' + model.output.name.split(':')[0])
        logger.info('start Deep Taylor decomposition')
        relevance_maps = deep_taylor(model, X, y)
    elif args.method == 'sensitivity':
        logger.info('start sensitivity analysis')
        relevance_maps = sensitivity_analysis(model, X, y, args.batch_size)
    else:
        raise ValueError('unknown method: ' + args.method)

    logger.info('save relevance maps to file: ' + args.output_file)
    prepare_output_file(args.output_file)
    fout = h5py.File(args.output_file, 'w')
    fout.create_dataset('X', data=X)
    fout.create_dataset('relevance_map', data=relevance_maps)
    fout.create_dataset('image_id', data=image_id)
    fout.close()
Code Example #15
def evaluate(args):
    import h5py
    from sklearn.metrics import r2_score, mean_squared_error
    from scipy.stats import pearsonr
    from utils import prepare_output_file, read_hdf5_dataset

    logger.info('read prediction file: ' + args.input_file)
    with h5py.File(args.input_file, 'r') as f:
        y_true = f['y_true'][:]
        y_pred = f['y_pred'][:]
    logger.info('read sample indices file: ' + args.sample_indices_file)
    indices = read_hdf5_dataset(args.sample_indices_file)
    y_true = y_true[indices]
    y_pred = y_pred[indices]

    logger.info('save metrics file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with open(args.output_file, 'w') as f:
        f.write('r2\tmse\tpcc\n')
        f.write('%f' % r2_score(y_true, y_pred))
        f.write('\t%f' % mean_squared_error(y_true, y_pred))
        f.write('\t%f' % pearsonr(y_true, y_pred)[0])
        f.write('\n')
Code Example #16
        # (fragment: the enclosing function definition and the matching
        #  KFold setup for female parents precede this excerpt in the source)
        kfold_male = KFold(args.k_male, shuffle=True)
        for train_index_female, _ in kfold_female.split(ind_female_training):
            train_index_female = ind_female_training[train_index_female]
            for train_index_male, _ in kfold_male.split(ind_male_training):
                train_index_male = ind_male_training[train_index_male]
                parent_table_train = np.ravel(
                    parent_table[:, train_index_female][train_index_male])
                train_index.append(
                    parent_table_train[is_training[parent_table_train]])
                parent_table_test = np.setdiff1d(
                    np.ravel(parent_table_training), parent_table_train)
                test_index.append(
                    parent_table_test[is_training[parent_table_test]])

    import h5py
    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with h5py.File(args.output_file, 'w') as fout:
        if args.max_size is not None:
            if len(train_index) > args.max_size:
                logger.info('randomly sample %d sets from %d sets' %
                            (args.max_size, len(train_index)))
                sel = np.random.choice(len(train_index),
                                       size=args.max_size,
                                       replace=False)
                train_index = [train_index[i] for i in sel]
                test_index = [test_index[i] for i in sel]
        for i in range(len(train_index)):
            g = fout.create_group(str(i))
            g.create_dataset('train', data=train_index[i])
            g.create_dataset('test', data=test_index[i])
Code Example #17
File: run_fastlmm.py  Project: caudjcc/emaize-1
def run_fastlmm(args):
    import os
    import h5py
    import numpy as np
    import pandas as pd
    from pysnptools.snpreader import SnpData, Pheno, SnpReader
    from utils import prepare_output_file, read_cvindex
    from fastlmm.inference import FastLMM
    import dill as pickle

    logger.info('read phenotypes from file: ' + args.phenotype_file)
    phenotypes = pd.read_table(args.phenotype_file)
    iid = np.repeat(phenotypes['id'].values.astype('S')[:, np.newaxis],
                    2,
                    axis=1)
    if args.cvindex_file is not None:
        logger.info('read indices from file: ' + args.cvindex_file)
        train_index, test_index = read_cvindex(args.cvindex_file)
    else:
        train_index = np.nonzero((phenotypes['type'] == 'training').values)[0]
        test_index = np.nonzero((phenotypes['type'] == 'test').values)[0]

    n_snps_total = get_num_snps(args.snp_file)
    n_snps_sel = min(n_snps_total, args.n_snps)
    logger.info('number of sampled SNPs: %d' % n_snps_sel)
    sel_snps = np.random.choice(n_snps_total, size=n_snps_sel)

    logger.info('read SNP file (for test): ' + args.snp_file)
    test_snps = get_snpdata(iid,
                            args.snp_file,
                            transpose=args.transpose_x,
                            snp_indices=sel_snps,
                            std_filter_indices=train_index)
    logger.info('number of sampled SNPs after filtering by std: %d' %
                test_snps.shape[1])
    logger.info('read SNP file (for K0): ' + args.k0_file)
    K0 = get_snpdata(iid, args.k0_file, transpose=args.transpose_k0)

    if args.seed:
        logger.info('set random seed for numpy: %d' % args.seed)
        np.random.seed(args.seed)  # note: sel_snps above was sampled before the seed takes effect

    if not os.path.exists(args.output_dir):
        os.makedirs(args.output_dir)
    df_pheno = phenotypes.copy()
    df_pheno['fid'] = df_pheno['id']
    df_pheno['iid'] = df_pheno['id']
    traits = ('trait1', 'trait2', 'trait3')
    for trait in traits:
        pheno_file = os.path.join(args.output_dir, 'pheno.%s.txt' % trait)
        logger.info('create Pheno file: ' + pheno_file)
        df_pheno.loc[train_index, ['fid', 'iid', trait]].to_csv(pheno_file,
                                                                index=False,
                                                                sep='\t',
                                                                header=False)
        pheno = Pheno(pheno_file)
        logger.info('train FastLMM model for %s' % trait)
        model = FastLMM(GB_goal=args.GB_goal, force_low_rank=True)
        model.fit(X=test_snps[train_index, :],
                  y=pheno,
                  K0_train=K0,
                  penalty=args.penalty,
                  Smin=1.0)
        logger.info('fitted h2: %f' % model.h2raw)
        logger.info('predict using the FastLMM model for %s' % trait)
        y_mean, y_var = model.predict(X=test_snps[test_index, :],
                                      K0_whole_test=K0[test_index, :])
        y_true = phenotypes[trait][test_index].values
        result_file = os.path.join(args.output_dir, 'predictions.%s' % trait)
        logger.info('save predictions to file: ' + result_file)
        prepare_output_file(result_file)
        with h5py.File(result_file, 'w') as f:
            f.create_dataset('y_mean', data=y_mean.val)
            f.create_dataset('y_var', data=y_var.val)
            f.create_dataset('y_true', data=y_true)
            f.create_dataset('h2raw', data=model.h2raw)
            f.create_dataset('sel_snps', data=sel_snps)

        model_file = os.path.join(args.output_dir, 'model.fastlmm.%s' % trait)
        logger.info('save model to file: ' + model_file)
        with open(model_file, 'wb') as f:
            pickle.dump(model, f)
Code Example #18
File: report.py  Project: ltbyshi/cardiacai
def summarize_cv(args):
    import os
    import h5py
    import numpy as np
    import pandas as pd
    from utils import array_lookup, prepare_output_file
    columns = {}
    colnames = {}
    with h5py.File(os.path.join(args.input_dir, 'cv_split'), 'r') as f:
        n_folds = len(f.keys())
        cv_split = {}
        for cv_fold in f.keys():
            cv_split[int(cv_fold)] = {}
            g = f[cv_fold]
            for key in g.keys():
                cv_split[int(cv_fold)][key] = g[key][:]
    colnames['classification'] = []
    if args.task == 'classification':
        with h5py.File(os.path.join(args.input_dir, 'targets'), 'r') as f:
            class_name = f['class_name'][:]
            y = f['y'][:]
            image_id_y = f['image_id'][:]
        colnames['classification'] = ['train_size', 'test_size']
        colnames['classification'] += ['class_size(%s)'%(class_name[i]) for i in range(len(class_name))]
    columns['cv_fold'] = np.full(n_folds, -1, dtype='int32')
    colnames['metric'] = []
    for cv_fold in range(n_folds):
        cv_dir = os.path.join(args.input_dir, 'cv', str(cv_fold))
        columns['cv_fold'][cv_fold] = cv_fold
        if not os.path.isdir(cv_dir):
            continue
        pred_file = os.path.join(cv_dir, 'predictions')
        with h5py.File(pred_file, 'r') as f:
            g = f['metrics']
            # get column names
            if len(colnames['metric']) == 0:
                colnames['metric'] = []
                for metric in g.keys():
                    if len(g[metric].shape) == 0:
                        colnames['metric'].append(metric)
                    elif metric == 'accuracy_by_class':
                        colnames['metric'] += ['%s(%s)'%(metric, class_name[i]) for i in range(g[metric].shape[0])]
                for metric in colnames['metric']:
                    columns[metric] = np.full(n_folds, np.nan, dtype='float64')
                if args.task == 'classification':
                    for colname in ['train_size', 'test_size']:
                        columns[colname] = np.zeros(n_folds, dtype='int32')
                    for i in range(len(class_name)):
                        columns['class_size(%s)'%(class_name[i])] = np.zeros(n_folds, dtype='int32')
            for metric in g.keys():
                if len(g[metric].shape) == 0:
                    columns[metric][cv_fold] = g[metric][()]
                elif metric == 'accuracy_by_class':
                    metric_vals = g[metric][:]
                    for i in range(g[metric].shape[0]):
                        columns['%s(%s)'%(metric, class_name[i])][cv_fold] = metric_vals[i]
            if args.task == 'classification':
                columns['train_size'][cv_fold] = cv_split[cv_fold]['train'].shape[0]
                columns['test_size'][cv_fold] = cv_split[cv_fold]['test'].shape[0]
                # y_test is the same for every class, so look it up once
                y_test = y[array_lookup(image_id_y, cv_split[cv_fold]['test'])]
                for i in range(len(class_name)):
                    # one-hot coding for multi-class
                    if len(y_test.shape) > 1:
                        columns['class_size(%s)' % class_name[i]][cv_fold] = np.sum(y_test[:, i])
                    # two-class
                    else:
                        columns['class_size(%s)' % class_name[i]][cv_fold] = np.sum(y_test == i)
    summary = pd.DataFrame(columns)
    attribute_keys = []
    if args.attribute is not None:
        for a in args.attribute:
            if '=' not in a:
                raise ValueError('missing = in attribute: ' + a)
            ind = a.index('=')
            key = a[:ind].strip()
            val = a[(ind + 1):].strip()
            summary[key] = val
            attribute_keys.append(key)
    summary = summary[attribute_keys + ['cv_fold'] + colnames['classification'] + colnames['metric']]

    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    summary.to_csv(args.output_file, sep='\t', index=False)
Code Example #19
def plot_predictions(args):
    import h5py
    from utils import read_hdf5_single, prepare_output_file, read_hdf5_dataset
    import matplotlib
    matplotlib.use('Agg')
    import matplotlib.pyplot as plt
    from matplotlib.backends.backend_pdf import PdfPages
    plt.rcParams['font.family'] = 'Arial'
    plt.rcParams['font.size'] = 12
    plt.rcParams['legend.fontsize'] = 12
    import numpy as np

    def normalize_phenotype(x, range_pheno=4.0):
        return (np.clip(x, -range_pheno, range_pheno) +
                range_pheno) / 2.0 / range_pheno

    logger.info('read parent table file: ' + args.parent_table_file)
    parent_table = read_hdf5_single(args.parent_table_file)
    logger.info('read predictions from file: ' + args.input_file)
    with h5py.File(args.input_file, 'r') as f:
        y_true = f['y_true'][:]
        y_pred = f['y_pred'][:]
    logger.info('read training indices from file: ' + args.train_indices_file)
    train_index = read_hdf5_dataset(args.train_indices_file)
    logger.info('read test indices from file: ' + args.test_indices_file)
    test_index = read_hdf5_dataset(args.test_indices_file)

    y_pred_train = np.full(y_pred.shape, np.nan)
    y_pred_train[train_index] = y_pred[train_index]
    y_pred_test = np.full(y_pred.shape, np.nan)
    y_pred_test[test_index] = y_pred[test_index]

    logger.info('create output file: ' + args.output_file)
    prepare_output_file(args.output_file)
    with PdfPages(args.output_file) as pdf:
        fig, axes = plt.subplots(4, 1, figsize=(10, 8))
        axes[0].matshow(np.take(np.ravel(normalize_phenotype(y_true)),
                                parent_table),
                        cmap=plt.cm.RdBu_r)
        axes[0].set_title('True phenotypes')

        axes[1].matshow(np.take(np.ravel(normalize_phenotype(y_pred)),
                                parent_table),
                        cmap=plt.cm.RdBu_r)
        axes[1].set_title('Predicted phenotypes')

        axes[2].matshow(np.take(np.ravel(normalize_phenotype(y_pred_train)),
                                parent_table),
                        cmap=plt.cm.RdBu_r)
        axes[2].set_title('Predicted phenotypes (train)')

        axes[3].matshow(np.take(np.ravel(normalize_phenotype(y_pred_test)),
                                parent_table),
                        cmap=plt.cm.RdBu_r)
        axes[3].set_title('Predicted phenotypes (test)')

        plt.tight_layout()
        pdf.savefig(fig)

        plt.clf()
        fig, axes = plt.subplots(2, 3, figsize=(10, 6))
        axes[0, 0].hist(y_true[~np.isnan(y_true)], bins=50)
        axes[0, 0].set_title('True phenotypes')
        axes[0, 1].hist(y_true[train_index], bins=50)
        axes[0, 1].set_title('True phenotypes (train)')
        axes[0, 2].hist(y_true[test_index], bins=50)
        axes[0, 2].set_title('True phenotypes (test)')
        axes[1, 0].hist(y_pred, bins=50)
        axes[1, 0].set_title('Predicted phenotypes')
        axes[1, 1].hist(y_pred[train_index], bins=50)
        axes[1, 1].set_title('Predicted phenotypes (train)')
        axes[1, 2].hist(y_pred[test_index], bins=50)
        axes[1, 2].set_title('Predicted phenotypes (test)')
        for i in range(2):
            for j in range(3):
                axes[i, j].set_xlim(-5, 5)
        plt.tight_layout()
        pdf.savefig(fig)

        plt.clf()
        fig, axes = plt.subplots(1, 3, figsize=(10, 4))
        axes[0].scatter(y_true[~np.isnan(y_true)],
                        y_pred[~np.isnan(y_true)],
                        s=3)
        axes[0].set_xlabel('True phenotypes')
        axes[0].set_ylabel('Predicted phenotypes')
        axes[0].set_title('All samples')

        axes[1].scatter(y_true[train_index], y_pred[train_index], s=3)
        axes[1].set_xlabel('True phenotypes')
        axes[1].set_ylabel('Predicted phenotypes')
        axes[1].set_title('Training samples')

        axes[2].scatter(y_true[test_index], y_pred[test_index], s=3)
        axes[2].set_xlabel('True phenotypes')
        axes[2].set_ylabel('Predicted phenotypes')
        axes[2].set_title('Test samples')

        plt.tight_layout()
        pdf.savefig(fig)