Example #1
def preserve_original_files(args):
    '''
    Save original files in sub-directory; we'll be rewriting them
    in "prepare_files()".  (This function should be called once.)
    '''
    orig_dir = os.path.join(args.working_dir, 'original')
    utils.safe_mkdir(orig_dir)
    file_prefix_list = ['ensembl', 'experiment']
    for file_prefix in file_prefix_list:
        filebase = 'pruned_%s.csv' % file_prefix
        filename_old = os.path.join(args.working_dir, filebase)
        filename_new = os.path.join(orig_dir, filebase)
        os.rename(filename_old, filename_new)
    os.rename(os.path.join(args.results_dir, 'uncorrected.csv'),
              os.path.join(orig_dir, 'uncorrected.csv'))
Example #2
            best_c = c

    (proposed_population, proposed_SNP_list) = pareto_point(best_c)
    logger.info(
        "Pareto-optimal point: score=%.3f, c=%d, |pop|=%d, |SNPs|=%d" %
        (best_score, best_c, len(proposed_population), len(proposed_SNP_list)))

    logger.info("Missing SNPs:")
    for SNP in SNP_list:
        if SNP not in proposed_SNP_list:
            logger.info("   %s" % SNP)

    f = open(os.path.join(args.working_dir, 'genotypes_ensembl.csv'), 'w')
    f.write('id')
    for SNP in proposed_SNP_list:
        f.write(',%s' % SNP)
    f.write('\n')
    for person in proposed_population:
        f.write('%s' % person)
        for SNP in proposed_SNP_list:
            f.write(',%s' % (genotypes[SNP][person]))
        f.write('\n')
    f.close()
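    # The file written above has one row per retained individual, for example
    # (illustrative values only):
    #   id,rs12103,rs6667605
    #   1000GENOMES:phase_3:HG00096,C|C,T|C
    #   1000GENOMES:phase_3:HG00097,C|T,C|C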


if __name__ == '__main__':
    args = utils.parse_arguments()
    utils.safe_mkdir(args.working_dir)
    utils.initialize_logger(args)
    refine(args)
Example #3
def download_SNPs(args):
    '''
    Extract list of SNPs and extract data for each.  Do it in parallel.
    '''

    logger.info("####################################")
    logger.info("Downloading SNP data from ENSEMBL")

    cache_dir = os.path.join(args.working_dir, 'ensembl_cache')
    utils.safe_mkdir(cache_dir)

    # Input is, e.g., "rs56116432"; output is a dictionary mapping
    # an individual (like '1000GENOMES:phase_3:HG00096') to
    # a genotype at the target SNP (like 'C|C')

    # SNP_plus_list=['rs12103_A','rs6667605_C']
    df_in = pd.read_csv(os.path.join(args.working_dir, 'cleaned_input.csv'))
    SNP_list = [x for x in df_in.columns if x.startswith('rs')]

    logger.info("Start download of genotype data for %d SNPs" %
                (len(SNP_list)))

    # If there are too many cores, we could overwhelm the ENSEMBL server with requests.  So,
    # we throttle the parallelism.
    num_cores = multiprocessing.cpu_count()
    if args.num_workers == -1:
        num_workers = num_cores
    else:
        num_workers = args.num_workers
    server_threshold = 16
    if num_workers > server_threshold:
        logger.info(
            "Throttling down parallelism from %d to %d for ENSEMBL server" %
            (num_workers, server_threshold))
        num_workers = server_threshold

    results = Parallel(n_jobs=num_workers)(
        delayed(grab_individual_genotypes)(SNP, cache_dir) for SNP in SNP_list)

    (chr_list, chr_loc_list, wild_type_list, geno_list) = zip(*results)
    df = pd.DataFrame({
        'SNP': SNP_list,
        'chromosome': chr_list,
        'chromosome_position': chr_loc_list,
        'wild_type': wild_type_list
    })
    df.to_csv(os.path.join(args.working_dir, 'SNP_facts.csv'), index=False)

    # If original data did not specify wild_type, then extract reasonable
    # values from ENSEMBL.
    if not os.path.exists(os.path.join(args.working_dir, 'wild_types.csv')):
        unique_wild_type = []
        for i in xrange(len(wild_type_list)):
            unique_wild_type.append(wild_type_list[i].split('/')[0])
        df_wild = pd.DataFrame({
            'SNP': SNP_list,
            'wild_type': unique_wild_type
        })
        df_wild.to_csv(os.path.join(args.working_dir, 'wild_types.csv'),
                       index=False)

    genotypes = dict(zip(SNP_list, geno_list))
    outfile = os.path.join(args.working_dir, "ensembl.pkl")
    pickle.dump(genotypes, open(outfile, "wb"))
    logger.info("Download completed.")
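grab_individual_genotypes() is not shown in this example. Below is a minimal sketch of what such a helper might look like, assuming the public ENSEMBL REST endpoint /variation/human/<rsid>?genotypes=1; the JSON field names ('mappings', 'seq_region_name', 'start', 'allele_string', 'genotypes', 'sample', 'genotype') are assumptions about that API, not taken from the original code.

import json
import os

import requests


def grab_individual_genotypes(SNP, cache_dir):
    '''
    Sketch: fetch (chromosome, position, allele string, per-individual genotypes)
    for one SNP from the ENSEMBL REST API, caching the raw JSON on disk.
    '''
    cache_file = os.path.join(cache_dir, '%s.json' % SNP)
    if os.path.exists(cache_file):
        with open(cache_file) as fp:
            data = json.load(fp)
    else:
        response = requests.get(
            'https://rest.ensembl.org/variation/human/%s?genotypes=1' % SNP,
            headers={'Content-Type': 'application/json'})
        response.raise_for_status()
        data = response.json()
        with open(cache_file, 'w') as fp:
            json.dump(data, fp)

    mapping = data['mappings'][0]
    chromosome = mapping['seq_region_name']
    chromosome_position = mapping['start']
    wild_type = mapping['allele_string']                 # e.g. 'C/T'
    genotypes = {entry['sample']: entry['genotype']      # e.g. 'C|C'
                 for entry in data['genotypes']}
    return (chromosome, chromosome_position, wild_type, genotypes)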
Example #4
def significant_SNPs(args):
    '''
    Determine which SNPs are actually significant predictors of features.
    '''

    logger.info("####################################")
    logger.info("Classifier for significance.")

    logger.info("SGD iterations: %d" % args.SGD_max_iterations)

    logger.info("Target FDR: %.2f" % args.fdr)

    # Extract list of data labels (i.e., the dependent variables we're trying to
    # predict)
    df_for_field_names = pd.read_csv(
        os.path.join(args.working_dir, 'pruned_experiment.csv'))
    label_fields = [
        field for field in df_for_field_names.columns if field.startswith(args.data_prefix)]
    feature_fields = [
        field for field in df_for_field_names.columns if field.startswith('rs')]
    del df_for_field_names

    logger.info("Num features=%d, num labels=%d" %
                (len(feature_fields), len(label_fields)))

    logger.info('Label fields:')
    logger.info(label_fields)

    utils.safe_mkdir(os.path.join(args.working_dir, 'results'))

    # Do the work (in parallel)
    results = (Parallel(n_jobs=args.num_workers)
               (delayed(single_FDR)(child_num, args.SGD_max_iterations, args, *x)
                for child_num, x in enumerate(itertools.product(label_fields,
                                                                xrange(args.num_knockoff_trials)))))

    summarized = {}
    for (one_label_field, SNP_list_mFDR, SNP_list_cFDR) in results:
        if one_label_field not in summarized:
            summarized[one_label_field] = {'mFDR': {}, 'cFDR': {}}
        for l, fdr in [(SNP_list_mFDR, 'mFDR'), (SNP_list_cFDR, 'cFDR')]:
            for SNP in l:
                if SNP not in summarized[one_label_field][fdr]:
                    summarized[one_label_field][fdr][SNP] = 0
                summarized[one_label_field][fdr][SNP] += 1

    out_fp = open(os.path.join(args.results_dir, 'knockoff_trials.txt'), 'w')
    out_fp.write(
        'Using the HMM knockoff framework, and applying the method %d times\n'
        'with independent knockoff samples, determine which SNPs are significant\n'
        'predictors of which data labels (i.e., dependent variables).\n\n'
        'We examine both a classical FDR (cFDR) and a modified FDR (mFDR),\n'
        'per Candes 2017, Equations 3.10 and 3.11.\n\n' % (
            args.num_knockoff_trials))
    out_fp.write('Target FDR: %.1f%%\n\n' % (100.0 * (args.fdr)))
    out_fp.write(str(datetime.datetime.now()))
    out_fp.write('\n')
    for one_label_field in summarized:
        out_fp.write('Label: %s\n' % one_label_field)
        for fdr in ['mFDR', 'cFDR']:
            out_fp.write('Type of FDR: %s\n' % fdr)
            if len(summarized[one_label_field][fdr]) == 0:
                out_fp.write('  No significant SNPs.\n')
            else:
                sorted_SNPs = sorted(
                    summarized[one_label_field][fdr].items(),
                    key=operator.itemgetter(1), reverse=True)
                for (SNP, count) in sorted_SNPs:
                    percentage = 100.0 * count / args.num_knockoff_trials
                    out_fp.write("   %s : %d%%\n" %
                                 (SNP, np.round(percentage)))
    out_fp.close()

    logger.info('Done with classifier!')
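For reference, the two error rates summarized above are, roughly (paraphrasing the Candes et al. knockoff papers cited in the output text, not taken from this code):

    cFDR = E[ #false selections / max(#selections, 1) ]
    mFDR = E[ #false selections / (#selections + 1/q) ]

where q is the target FDR level (args.fdr).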
Example #5
def stats(args):
    '''
    Compute some simple statistics on the data:
    * Univariate (uncorrected) p-value
    * (Uncorrected) likelihood ratio
    * Bonferroni corrected p-value
    '''

    df = pd.read_csv(os.path.join(args.working_dir, 'cleaned_input.csv'))
    utils.safe_mkdir(os.path.join(args.working_dir, 'results'))

    df_wild = pd.read_csv(os.path.join(args.working_dir, 'wild_types.csv'))
    SNP_to_wild_type = dict(
        zip(df_wild['SNP'].values, df_wild['wild_type'].values))

    # "features" are SNPs
    feature_list = [field for field in df.columns if field.startswith('rs')]
    # "labels" are the dependent variable (e.g., MRI observations)
    label_list = [
        field for field in df.columns if field.startswith(args.data_prefix)
    ]
    N = len(df)

    feature_array = np.zeros((N, len(feature_list)))
    for i, feature in enumerate(feature_list):
        feature_array[:, i] = utils.genotype_to_nonwild_type_count(
            df[feature].values, SNP_to_wild_type[feature])

    label_array = np.zeros((N, len(label_list)))
    for i, label in enumerate(label_list):
        label_array[:, i] = df[label].values[:]

    # The above counts the number of non-wild-type haplotypes, so the values are
    # 0 (wild type diploid), 1, or 2.  To analyze with 2x2 contingency table, we
    # will combine 1 and 2 into a single state, so we either have "diploid wild type"
    # or not.
    feature_array[feature_array == 2] = 1

    # Uncorrected p-value
    with open(os.path.join(args.working_dir, 'results', 'uncorrected.csv'),
              'w') as f:
        f.write(
            'SNP,label,uncorrected_p_value,uncorrected_odds_ratio,'
            'bonferroni_corrected_p_value,empirical_ratio_with_imaging_feature,'
            'empirical_ratio_without_imaging_feature\n')

        contingency_table = np.zeros((2, 2))
        p_raw_array = np.zeros((len(label_list), len(feature_list)))

        logger.info('Bonferroni correction: %d labels x %d SNPs = %d tests' %
                    (len(label_list), len(feature_list),
                     len(label_list) * len(feature_list)))

        for label_index, label in enumerate(label_list):
            for feature_index, feature in enumerate(feature_list):
                for label_state in [0, 1]:
                    for feature_state in [0, 1]:
                        contingency_table[feature_state, label_state] = (
                            np.sum(
                                np.logical_and(
                                    feature_array[:, feature_index] ==
                                    feature_state,
                                    df[label].values == label_state)))
                oddsratio, pvalue = fisher_exact(contingency_table)
                p_raw_array[label_index, feature_index] = pvalue
                bonferroni = pvalue * len(feature_list) * len(label_list)
                if bonferroni > 1.0:
                    bonferroni = 1.0

                # Unfortunately, an "imaging feature" is what we call a "label" in the
                # contingency table, not a "feature".
                empirical_ratio_with_feature = '%d/%d' % (contingency_table[
                    1, 1], contingency_table[1, 1] + contingency_table[0, 1])
                empirical_ratio_without_feature = '%d/%d' % (contingency_table[
                    1, 0], contingency_table[1, 0] + contingency_table[0, 0])
                f.write('%s,%s,%f,%f,%f,%s,%s\n' %
                        (feature, label, pvalue, oddsratio, bonferroni,
                         empirical_ratio_with_feature,
                         empirical_ratio_without_feature))
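As a small worked example of the 2x2 test used above (illustrative counts, not real data), with rows and columns following the code's convention contingency_table[feature_state, label_state]:

import numpy as np
from scipy.stats import fisher_exact

# Rows: feature_state (0 = diploid wild type, 1 = at least one non-wild-type allele)
# Columns: label_state (0 = imaging feature absent, 1 = present); counts are made up.
contingency_table = np.array([[40, 10],
                              [20, 30]])
oddsratio, pvalue = fisher_exact(contingency_table)
# oddsratio = (40 * 30) / (10 * 20) = 6.0; pvalue is the two-sided Fisher exact p-value.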
Example #6
def prepare_files(args, p_trial_num):
    '''
    Slice and dice the files saved in "preserve_original_files()", swapping a
    random subset of ENSEMBL individuals in for the experimental subjects so
    that each run yields one null-hypothesis sample for the p-value
    computation.  (This function will be called many times.)
    '''

    logger.info('#############################')
    logger.info('#############################')
    logger.info('Preparing next set of files')

    # Remove any old cruft
    file_prefix_list = ['ensembl', 'experiment']
    for file_prefix in file_prefix_list:
        filebase = 'pruned_%s.csv' % file_prefix
        filename_old = os.path.join(args.working_dir, filebase)
        try:
            os.unlink(filename_old)
        except Exception:
            pass
    if False:  # Disabled by default: optionally also clear the fastphase_cache and knockoffs directories.
        dir_list = ['fastphase_cache', 'knockoffs']
        for d in dir_list:
            try:
                shutil.rmtree(os.path.join(args.working_dir, d),
                              ignore_errors=True)
            except Exception:
                pass

    # Change random seed
    args.random_seed += 10000
    logger.info('Random seed is now %d' % args.random_seed)

    # Create and point to bespoke output directory
    args.results_dir = os.path.join(
        args.working_dir, 'results_%03d' % (p_trial_num))
    utils.safe_mkdir(args.results_dir)

    # Read in real (original) experiment and ENSEMBL data
    orig_dir = os.path.join(args.working_dir, 'original')
    df_experiment = pd.read_csv(os.path.join(
        orig_dir, 'pruned_experiment.csv'))
    df_ensembl = pd.read_csv(os.path.join(orig_dir, 'pruned_ensembl.csv'))

    # Partition off random subset of data to replace experiment data
    num_subjects = len(df_experiment)
    num_ensembl_total = len(df_ensembl)
    assert num_ensembl_total >= num_subjects
    fake_subject_index = np.random.permutation(
        np.arange(num_ensembl_total).astype(int))[:num_subjects]
    for i, j in enumerate(fake_subject_index):
        for col in df_ensembl:
            df_experiment[col].values[i] = df_ensembl[col].values[j]
    remaining_ensembl_index = np.ones(num_ensembl_total).astype(bool)
    remaining_ensembl_index[fake_subject_index] = False
    df_ensembl = df_ensembl.iloc[remaining_ensembl_index].reset_index()
    assert len(df_ensembl) == num_ensembl_total - num_subjects

    # Write new, doctored version of data.
    shutil.copyfile(os.path.join(orig_dir, 'uncorrected.csv'),
                    os.path.join(args.results_dir, 'uncorrected.csv'))
    df_experiment.to_csv(os.path.join(
        args.working_dir, 'pruned_experiment.csv'), index=False)
    df_ensembl.to_csv(os.path.join(
        args.working_dir, 'pruned_ensembl.csv'), index=False)
Example #7
def extract_null_distribution(args):
    if args.skip_p_value_accumulation:
        return

    p_dir = os.path.join(args.working_dir, 'p_values')
    utils.safe_mkdir(p_dir)

    if args.download_gcloud:
        logger.info('Downloading p-values from Google cloud.')

        utils.download_prefix_from_gcloud(bucket_name=args.bucket_name,
                                          prefix='p_values/all_results_%d_' % (
                                              args.original_random_seed),
                                          destination_dir=p_dir)
        utils.download_prefix_from_gcloud(bucket_name=args.bucket_name,
                                          prefix='causal_%d/' % (
                                              args.original_random_seed),
                                          destination_dir=args.original_results_dir)
    elif args.download_aws:
        logger.info('AWS S3 download not implemented.')
        pass
    else:
        # Just use local files
        for p_trial_num in xrange(args.p_samples):
            try:
                src_file = os.path.join(args.working_dir,
                                        'results_%03d' % (p_trial_num),
                                        'all_results.csv')
                dst_file = os.path.join(p_dir,
                                        'all_results_%d_%d_%03d.csv' % (
                                            args.original_random_seed,
                                            args.machine_num,
                                            p_trial_num))
                shutil.copyfile(src_file, dst_file)
            except Exception:
                logger.info("Failed to copy %s" % src_file)

    # Figure out how many null-hypothesis samples we *actually* have
    null_hypo_files = [f for f in os.listdir(p_dir)
                       if os.path.isfile(os.path.join(p_dir, f)) and f.startswith('all_results')]
    logger.info('Found %d files for the null hypothesis' %
                (len(null_hypo_files)))
    if len(null_hypo_files) == 0:
        return

    # Peek at an arbitrary file to determine the label types (e.g., "Imaging:
    # Fistula")
    df = pd.read_csv(os.path.join(p_dir, null_hypo_files[0]))
    label_types = np.unique(df.label.values)
    max_obs_freq = []

    # Extract max obs_freq from each file for each label type and each FDR
    # (mFDR vs cFDR)
    for f in null_hypo_files:
        df = pd.read_csv(os.path.join(p_dir, f))
        for label in label_types:
            for fdr in ['mFDR', 'cFDR']:
                index = np.all((df.label.values == label,
                                df.fdr_type.values == fdr), axis=0)
                if np.sum(index) == 0:
                    M = 0
                else:
                    M = np.max(df.obs_freq.values[index])
                max_obs_freq.append((label, fdr, M))

    utils.safe_mkdir(args.original_results_dir)
    (label_list, fdr_list, max_list) = zip(*max_obs_freq)
    df_null_hypo = pd.DataFrame(
        {'label': label_list, 'fdr': fdr_list, 'max_obs_freq': max_list})
    df_null_hypo.to_csv(os.path.join(
        args.original_results_dir, 'null_hypothesis_max.csv'), index=False)

    # For reference, report the p = args.p_thresh (e.g., 0.05) max-obs_freq threshold for each label.
    logger.info("p=%.2f (max) threshold for each label" % (args.p_thresh))
    for label in sorted(list(set(label_list))):
        samples = np.sort(df_null_hypo.iloc[
                          df_null_hypo.label.values == label].max_obs_freq.values)
        index = (1.0 - args.p_thresh) * (1 + len(samples)) - 1
        index = int(np.round(index))
        if index < 0:
            index = 0
        if index >= len(samples):
            index = len(samples) - 1
        v = samples[index]
        logger.info("   %s : %.1f%%" % (label, v))

    # Extract all obs_freq ("q" = "obs_freq")
    # First, extract all obs_freq
    fdr_mode_list = ['mFDR', 'cFDR']
    SNP_dict = {}
    q_dict = {}
    for fdr in fdr_mode_list:
        q_dict[fdr] = {}
        for label in label_list:
            q_dict[fdr][label] = {}
    for f in null_hypo_files:
        df = pd.read_csv(os.path.join(p_dir, f))
        for i in xrange(len(df)):
            fdr = df.fdr_type.values[i]
            label = df.label.values[i]
            SNP = df.SNP.values[i]
            if SNP not in q_dict[fdr][label]:
                q_dict[fdr][label][SNP] = []
                SNP_dict[SNP] = True
            q_dict[fdr][label][SNP].append(
                df.obs_freq.values[i])
    for fdr in fdr_mode_list:
        for label in label_list:
            for SNP in SNP_dict:
                if SNP not in q_dict[fdr][label]:
                    q_dict[fdr][label][SNP] = []
    # Second, add back in zero counts, which had been suppressed
    all_q = {}
    for fdr in fdr_mode_list:
        all_q[fdr] = {}
        for label in label_list:
            all_q[fdr][label] = []
            for SNP in SNP_dict:
                extra = len(null_hypo_files) - len(q_dict[fdr][label][SNP])
                for i in xrange(extra):
                    q_dict[fdr][label][SNP].append(0.0)
                all_q[fdr][label] += q_dict[fdr][label][SNP]

    # If "sig_max.csv" or "sig_results.csv" files are present, add p-values
    for f in ['sig_max', 'sig_results']:
        filename = os.path.join(args.original_results_dir, "%s.csv" % (f))
        try:
            df = pd.read_csv(filename)
        except Exception:
            logger.info("Could not open file %s" % (filename))
            continue
        df['p_value_for_obs_freq'] = 0.0
        for i in xrange(len(df)):
            fdr = df.fdr_type.values[i]
            label = df.label.values[i]
            SNP = df.SNP.values[i]
            q = df.obs_freq.values[i]
            x = np.sort(all_q[fdr][label])
            y = np.linspace(0, 1, len(x))
            yy = 1.0 - np.power(y, len(SNP_dict))
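            # y is the empirical CDF of a single SNP's null obs_freq; 1 - y**len(SNP_dict)
            # is a Sidak-style correction giving the chance that at least one of the
            # SNPs would reach a given obs_freq purely under the null.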
            ii = np.searchsorted(x, q)
            if ii < len(yy):
                # Typical case:
                p = yy[ii]
            else:
                # The observed value exceeds the largest null-hypothesis sample
                # (a good outcome for a causal SNP), so report p = 0.
                p = 0.0

            df['p_value_for_obs_freq'].values[i] = p
        try:
            df.to_csv(filename, index=False)
        except Exception:
            # Apparently, original file is not writable; try renaming output.
            filename = os.path.join(
                args.original_results_dir, "%s_p.csv" % (f))
            df.to_csv(filename, index=False)

        if args.upload_gcloud:
            utils.upload_file_to_gcloud(bucket_name=args.bucket_name,
                                        source_name=filename,
                                        destination_name='causal_%d/%s' % (
                                            args.original_random_seed,
                                            os.path.basename(filename)))

    logger.info("Done constructing null hypothesis p-values")
Example #8
def make_all_knockoffs(args):
    '''
    For each chromosome, independently:
       Sort SNPs according to position on genome.
       Train HMM parameters with EM on ENSEMBL data.
       Generate knockoffs of the experimental SNP data.

    For now, we ignore the sex of individuals, although it is
    available in ENSEMBL.
    '''

    logger.info("####################################")
    logger.info("Fitting HMM and generating knockoffs")

    path_to_fp = os.path.join(args.fastPHASE_path, 'fastPHASE')
    if not os.path.exists(path_to_fp):
        logger.info("Cannot find fastPHASE at %s" % path_to_fp)
        raise Exception("fastPHASE not found at %s" % path_to_fp)

    cache_dir = os.path.join(args.working_dir, 'fastphase_cache')
    utils.safe_mkdir(cache_dir)

    df_geno_ensembl = pd.read_csv(os.path.join(
        (args.working_dir), 'pruned_ensembl.csv'))

    # SNP,wild_type,chromosome,chromosome_position
    df_SNP = pd.read_csv(os.path.join(
        (args.working_dir), 'pruned_SNP_facts.csv'))
    df_wild = pd.read_csv(os.path.join(args.working_dir, 'wild_types.csv'))
    SNP_to_wild_type = dict(
        zip(df_wild['SNP'].values, df_wild['wild_type'].values))

    chromosome_list = np.sort(np.unique(df_SNP['chromosome']))
    for chromosome in chromosome_list:
        assert chromosome in np.arange(1, 24)

    df_geno_experiment = pd.read_csv(os.path.join(
        (args.working_dir), 'pruned_experiment.csv'))

    # Make sure we have the same SNPs everywhere.
    assert (set([c for c in df_geno_ensembl.columns if c.startswith('rs')]) ==
            set([c for c in df_geno_experiment.columns if c.startswith('rs')]))
    for SNP in df_SNP.SNP.values:
        assert SNP in df_geno_ensembl.columns

    grouped_by_chromosome = df_SNP.groupby('chromosome')
    num_experiment_people = len(df_geno_experiment)

    knockoff_SNP_list = []

    utils.safe_mkdir(os.path.join(args.working_dir, 'knockoffs'))

    em_iterations = 500
    logger.info('Number of EM iterations: %d' % em_iterations)

    for knockoff_trial_count in xrange(args.num_knockoff_trials):
        random_seed = knockoff_trial_count + args.random_seed
        if ((args.num_knockoff_trials <= 20) or
                knockoff_trial_count % ((args.num_knockoff_trials) // 20) == 0):
            logger.info("Knockoff sampling %d of %d" % (
                knockoff_trial_count, args.num_knockoff_trials))

        if False:
            # Serial version; code preserved for debugging purposes
            for chromosome in chromosome_list:
                knockoff_SNP_list.append(
                    make_knockoff(
                        chromosome=chromosome,
                        grouped_by_chromosome=grouped_by_chromosome, df_SNP=df_SNP,
                        df_geno_experiment=df_geno_experiment, df_geno_ensembl=df_geno_ensembl,
                        SNP_to_wild_type=SNP_to_wild_type, cache_dir=cache_dir,
                        path_to_fp=path_to_fp, em_iterations=em_iterations, random_seed=random_seed))
        else:
            knockoff_SNP_list = Parallel(n_jobs=args.num_workers)(
                delayed(make_knockoff)(
                    chromosome=i,
                    grouped_by_chromosome=grouped_by_chromosome, df_SNP=df_SNP,
                    df_geno_experiment=df_geno_experiment, df_geno_ensembl=df_geno_ensembl,
                    SNP_to_wild_type=SNP_to_wild_type, cache_dir=cache_dir, path_to_fp=path_to_fp,
                    em_iterations=em_iterations, random_seed=random_seed)
                for i in chromosome_list)
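        # Each element of knockoff_SNP_list is a tuple produced by make_knockoff()
        # (defined elsewhere): (X_knockoffs, X_experiment, SNPs_on_chromosome),
        # unpacked below when the per-chromosome results are stitched together.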

        # Stitch results for each chromosome back together into a single dataframe
        # Knockoff results
        SNP_columns = [
            x for x in df_geno_ensembl.columns if x.startswith('rs')]
        df_knockoffs = pd.DataFrame(
            columns=SNP_columns, index=np.arange(num_experiment_people))

        # Matched experimental observations + knockoffs in one dataframe
        matched_columns = []
        data_labels = []
        for field in df_geno_experiment.columns:
            if field.startswith('rs'):
                matched_columns.append(field)
                matched_columns.append(field + '_knockoff')
            elif field.startswith(args.data_prefix):
                data_labels.append(field)
            else:
                continue
        df_matched = pd.DataFrame(columns=matched_columns + data_labels,
                                  index=np.arange(num_experiment_people))

        for (X_knockoffs, X_experiment, SNPs_on_chromosome) in knockoff_SNP_list:
            for i in xrange(num_experiment_people):
                for j, SNP in enumerate(SNPs_on_chromosome):
                    df_knockoffs[SNP].values[i] = X_knockoffs[i, j]
                    df_matched[SNP].values[i] = int(X_experiment[i, j])
                    df_matched[
                        SNP + '_knockoff'].values[i] = int(X_knockoffs[i, j])
        for data_label in data_labels:
            df_matched[data_label] = df_geno_experiment[data_label]

        # Sanity check that all fields are filled in.
        for field in df_knockoffs:
            for i in xrange(num_experiment_people):
                assert pd.notnull(df_knockoffs[field].values[i])

        df_matched.to_csv(os.path.join(args.working_dir, 'knockoffs',
                                       'knockoffs_%03d.csv' % knockoff_trial_count),
                          index=False)

    logger.info("Done making knockoffs!!!")