Example #1
# Third-party imports assumed by this snippet
import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold

# get_data is the CellCnn helper that reads the FCS files and returns
# (samples, phenotypes); the import path below is assumed
from cellCnn.utils import get_data


def prepare_data(fcs,
                 marker_names,
                 quant_normed,
                 arcsinh,
                 cofactor,
                 indir,
                 seed,
                 train_perc,
                 regression,
                 per_sample,
                 n_splits=None):
    # read the CSV listing the FCS file names and their corresponding labels
    fcs_info = np.array(pd.read_csv(fcs, sep=','))

    # if the samples have already been pre-processed via quantile normalization
    # we should not perform arcsinh transformation
    if quant_normed:
        arcsinh = False

    samples, phenotypes = get_data(indir, fcs_info, marker_names, arcsinh,
                                   cofactor)

    # generate training/validation sets
    np.random.seed(seed)
    val_perc = 1 - train_perc
    if n_splits is None:
        n_splits = int(1. / val_perc)
    # stratified CV for classification problems
    if not regression:
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True)
    # simple CV for regression problems
    else:
        skf = KFold(n_splits=n_splits, shuffle=True)

    # draw a single train/validation split
    train, val = next(skf.split(np.zeros((len(phenotypes), 1)), phenotypes))

    train_samples = [samples[i] for i in train]
    valid_samples = [samples[i] for i in val]
    train_phenotypes = [phenotypes[i] for i in train]
    valid_phenotypes = [phenotypes[i] for i in val]

    # always generate multi-cell inputs on a per-sample basis for regression
    if regression:
        per_sample = True

    return train_samples, train_phenotypes, valid_samples, valid_phenotypes, marker_names, fcs_info, samples
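
A minimal call sketch for the function above, assuming the CellCnn helper get_data is importable; the file name, marker list, and directory are hypothetical placeholders:

train_s, train_y, valid_s, valid_y, markers, fcs_info, samples = prepare_data(
    fcs='fcs_labels.csv',                # hypothetical CSV with FCS file names and labels
    marker_names=['CD3', 'CD4', 'CD8'],  # hypothetical marker subset
    quant_normed=False,
    arcsinh=True,
    cofactor=5,
    indir='./data',                      # hypothetical directory holding the FCS files
    seed=1234,
    train_perc=0.75,
    regression=False,
    per_sample=False)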
Example #2
# Standard-library and third-party imports assumed by this snippet
import argparse
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import StratifiedKFold, KFold

# CellCnn helpers and model class; the import paths below are assumed
from cellCnn.utils import get_data, save_results, get_selected_cells, mkdir_p
from cellCnn.plotting import plot_results
from cellCnn.model import CellCnn


def main():
    parser = argparse.ArgumentParser()
    # IO-specific
    parser.add_argument('-f', '--fcs', required=True,
                        help='file specifying the FCS file names and corresponding labels')
    parser.add_argument('-m', '--markers', required=True,
                        help='file specifying the names of markers to be used for analysis')
    parser.add_argument('-i', '--indir', default='./',
                        help='directory where input FCS files are located')
    parser.add_argument('-o', '--outdir', default='output',
                        help='directory where output will be generated')
    # note: with default=True this flag is effectively always on
    parser.add_argument('-p', '--plot', action='store_true', default=True,
                        help='whether to plot results')
    parser.add_argument('--export_selected_cells', action='store_true', default=False,
                        help='whether to export selected cell populations')
    parser.add_argument('--export_csv', action='store_true', default=False,
                        help='whether to export network weights as csv files')
    parser.add_argument('-l', '--load_results', action='store_true', default=False,
                        help='whether to load precomputed results')

    # data preprocessing
    parser.add_argument('--train_perc', type=float, default=0.75,
                        help='percentage of samples to be used for training')
    parser.add_argument('--arcsinh', dest='arcsinh', action='store_true',
                        help='preprocess the data with arcsinh')
    parser.add_argument('--no_arcsinh', dest='arcsinh', action='store_false',
                        help='do not preprocess the data with arcsinh')
    parser.set_defaults(arcsinh=True)
    parser.add_argument('--cofactor', type=int, default=5,
                        help='cofactor for the arcsinh transform')
    parser.add_argument('--scale', dest='scale', action='store_true',
                        help='z-transform features (mean=0, std=1) prior to training')
    parser.add_argument('--no_scale', dest='scale', action='store_false',
                        help='do not z-transform features (mean=0, std=1) prior to training')
    parser.set_defaults(scale=True)
    parser.add_argument('--quant_normed', action='store_true', default=False,
                        help='input data has been pre-processed via quantile normalization')

    # multi-cell input specific
    parser.add_argument('--ncell', type=int, help='number of cells per multi-cell input',
                        default=200)
    parser.add_argument('--nsubset', type=int, help='number of multi-cell inputs',
                        default=1000)
    parser.add_argument('--per_sample', action='store_true', default=False,
                        help='whether nsubset refers to each class or each sample')
    parser.add_argument('--subset_selection', choices=['random', 'outlier'], default='random',
                        help='generate random or outlier-enriched multi-cell inputs')

    # neural network specific
    parser.add_argument('--maxpool_percentages', nargs='+', type=float,
                        help='list of choices (percentage of multi-cell input) for top-k max pooling',
                        default=[0.01, 1, 5, 20, 100])
    parser.add_argument('--nfilter_choice', nargs='+', type=int,
                        help='list of choices for number of filters', default=range(3, 10))
    parser.add_argument('--learning_rate', type=float, default=0.005,
                        help='learning rate for the Adam optimization algorithm')
    parser.add_argument('--coeff_l1', type=float, default=0,
                        help='coefficient for L1 weight regularization')
    parser.add_argument('--coeff_l2', type=float, default=0.0001,
                        help='coefficient for L2 weight regularization')
    parser.add_argument('--coeff_activity', type=float, default=0,
                        help='coefficient for regularizing the activity at each filter')
    parser.add_argument('--max_epochs', type=int, default=20,
                        help='maximum number of iterations through the data')
    parser.add_argument('--patience', type=int, default=5,
                        help='number of epochs before early stopping')

    # analysis specific
    parser.add_argument('--seed', type=int, default=1234,
                        help='random seed')
    parser.add_argument('--nrun', type=int, default=15,
                        help='number of neural network configurations to try (should be >= 3)')
    parser.add_argument('--regression', action='store_true', default=False,
                        help='whether it is a regression problem (default is classification)')
    parser.add_argument('--dendrogram_cutoff', type=float, default=.4,
                        help='cutoff for hierarchical clustering of filter weights')
    parser.add_argument('--accur_thres', type=float, default=.9,
                        help='keep filters from models achieving at least this accuracy ' \
                             '(or at least from the best 3 models)')
    parser.add_argument('-v', '--verbose', type=int, choices=[0, 1], default=1,
                        help='output verbosity')

    # plot specific
    parser.add_argument('--filter_diff_thres', type=float, default=0.2,
                        help='threshold that defines which filters are discriminative')
    parser.add_argument('--filter_response_thres', type=float, default=0,
                        help='threshold that defines the selected cell population per filter')
    parser.add_argument('--positive_filters_only', action='store_true', default=False,
                        help='whether to only consider filters associated with higher cell ' \
                             'population frequencies in the positive class')
    parser.add_argument('--stat_test', choices=[None, 'ttest', 'mannwhitneyu'],
                        help='statistical test for comparing cell population frequencies of two ' \
                             'groups of samples')
    parser.add_argument('--group_a', default='group A',
                        help='name of the first class')
    parser.add_argument('--group_b', default='group B',
                        help='name of the second class')
    args = parser.parse_args()

    # read in the data
    fcs_info = np.array(pd.read_csv(args.fcs, sep=','))
    marker_names = list(pd.read_csv(args.markers, sep=',').columns)
    # if the samples have already been pre-processed via quantile normalization
    # we should not perform arcsinh transformation
    if args.quant_normed:
        args.arcsinh = False
    samples, phenotypes = get_data(args.indir, fcs_info, marker_names,
                                   args.arcsinh, args.cofactor)

    if not args.load_results:
        # generate training/validation sets
        np.random.seed(args.seed)
        val_perc = 1 - args.train_perc
        n_splits = int(1. / val_perc)
        # stratified CV for classification; plain KFold for regression targets
        if not args.regression:
            skf = StratifiedKFold(n_splits=n_splits, shuffle=True)
        else:
            skf = KFold(n_splits=n_splits, shuffle=True)
        train, val = next(skf.split(np.zeros((len(phenotypes), 1)), phenotypes))
        train_samples = [samples[i] for i in train]
        valid_samples = [samples[i] for i in val]
        train_phenotypes = [phenotypes[i] for i in train]
        valid_phenotypes = [phenotypes[i] for i in val]

        # run CellCnn
        model = CellCnn(ncell=args.ncell,
                        nsubset=args.nsubset,
                        per_sample=args.per_sample,
                        subset_selection=args.subset_selection,
                        scale=args.scale,
                        quant_normed=args.quant_normed,
                        maxpool_percentages=args.maxpool_percentages,
                        nfilter_choice=args.nfilter_choice,
                        nrun=args.nrun,
                        regression=args.regression,
                        learning_rate=args.learning_rate,
                        coeff_l1=args.coeff_l1,
                        coeff_l2=args.coeff_l2,
                        coeff_activity=args.coeff_activity,
                        max_epochs=args.max_epochs,
                        patience=args.patience,
                        dendrogram_cutoff=args.dendrogram_cutoff,
                        accur_thres=args.accur_thres,
                        verbose=args.verbose)
        model.fit(train_samples=train_samples, train_phenotypes=train_phenotypes,
                  valid_samples=valid_samples, valid_phenotypes=valid_phenotypes,
                  outdir=args.outdir)
        # save results for subsequent analysis (pickle requires binary mode)
        results = model.results
        with open(os.path.join(args.outdir, 'results.pkl'), 'wb') as f:
            pickle.dump(results, f)
    else:
        with open(os.path.join(args.outdir, 'results.pkl'), 'rb') as f:
            results = pickle.load(f)

    if args.export_csv:
        save_results(results, args.outdir, marker_names)

    # plot results
    if args.plot or args.export_selected_cells:
        mkdir_p(os.path.join(args.outdir, 'plots'))
        filter_info = plot_results(results, samples, phenotypes,
                                   marker_names, os.path.join(args.outdir, 'plots'),
                                   filter_diff_thres=args.filter_diff_thres,
                                   filter_response_thres=args.filter_response_thres,
                                   positive_filters_only=args.positive_filters_only,
                                   stat_test=args.stat_test,
                                   group_a=args.group_a, group_b=args.group_b)
        if args.export_selected_cells:
            csv_dir = os.path.join(args.outdir, 'selected_cells')
            mkdir_p(csv_dir)
            nfilter = len(filter_info)
            sample_names = [name.split('.fcs')[0] for name in list(fcs_info[:, 0])]
            # for each sample
            for x, x_name in zip(samples, sample_names):
                flags = np.zeros((x.shape[0], 2*nfilter))
                columns = []
                # for each filter
                for i, (filter_idx, thres) in enumerate(filter_info):
                    flags[:, 2*i:2*(i+1)] = get_selected_cells(
                        results['selected_filters'][filter_idx], x, results['scaler'], thres, True)
                    columns += ['filter_%d_continuous' % filter_idx, 'filter_%d_binary' % filter_idx]
                df = pd.DataFrame(flags, columns=columns)
                df.to_csv(os.path.join(csv_dir, x_name+'_selected_cells.csv'), index=False)
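
The snippet above only defines main(); a standard entry-point guard is assumed:

if __name__ == '__main__':
    main()

Assuming the script is saved as run_cellcnn.py (the name is hypothetical) and the CellCnn helpers are installed, a typical classification run could then be invoked as follows; all file and directory names are placeholders:

python run_cellcnn.py -f fcs_labels.csv -m markers.csv -i ./data -o ./output \
    --ncell 200 --nsubset 1000 --max_epochs 20 --export_csv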