Example #1
0
    def test_parse_input(self):
        """Check the output of parse.parse_input for both fixture files.

        parse.parse_input returns
        (samp_ids, var_names, df, n_var, n_samp); the sample IDs, the
        variable names and both counts are verified here (the dataframe at
        index 2 is not inspected).

        Whole sequences are compared instead of element-by-element lookups so
        that a result with missing (or no) entries cannot pass silently.
        """
        # expected values for file 2
        expected_samp_ids2 = [
            '100716FG.C.1.RL', '100804MB.C.1.RL', '100907LG.C.1.RL',
            '101007PC.C.1.RL', '101018WG.N.1.RL', '101019AB.N.1.RL',
            '101026RM.C.1.RL', '101109JD.N.1.RL', '101206DM.N.1.RL',
            '110222MG.C.1.RL', '110228CJ.N.1.RL', '110308DK.C.1.RL',
            '110314CS.N.1.RL', '110330DS.C.1.RL', '110406MB.C.1.RL',
            '110412ET.C.1.RL', '110418ML.C.1.RL', '110420JR.C.1.RL',
            '110502BC.N.1.RL', '110523CB.C.1.RL', '110601OG.C.1.RL',
            '110720BB.C.1.RL', '110727MK.C.1.RL', '110801EH.C.1.RL',
            '110808JB.N.1.RL', '110921AR.C.1.RL', '111003JG.C.1.RL',
            '111115WK.C.1.RL'
        ]
        expected_var_names2 = ['glutamic_acid', 'glycine']

        # expected values for file 1
        expected_samp_ids1 = [
            '101019AB.N.1.RL', '110228CJ.N.1.RL', '110314CS.N.1.RL',
            '110502BC.N.1.RL', '110808JB.N.1.RL', '101018WG.N.1.RL',
            '101109JD.N.1.RL', '101206DM.N.1.RL', '100907LG.C.1.RL',
            '110308DK.C.1.RL', '110412ET.C.1.RL', '110418ML.C.1.RL',
            '110601OG.C.1.RL', '110720BB.C.1.RL', '110727MK.C.1.RL',
            '110801EH.C.1.RL', '110921AR.C.1.RL', '111003JG.C.1.RL',
            '111115WK.C.1.RL', '100804MB.C.1.RL', '100716FG.C.1.RL',
            '101007PC.C.1.RL', '101026RM.C.1.RL', '110222MG.C.1.RL',
            '110330DS.C.1.RL', '110406MB.C.1.RL', '110420JR.C.1.RL',
            '110523CB.C.1.RL'
        ]
        expected_var_names1 = [
            'k__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__',
            'k__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobacterium',
            'k__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanobrevibacter',
            'k__Archaea;p__Euryarchaeota;c__Methanobacteria;o__Methanobacteriales;f__Methanobacteriaceae;g__Methanothermobacter',
            'k__Archaea;p__Euryarchaeota;c__Methanomicrobia;o__Methanocellales;f__Methanocellaceae;g__Methanocella'
        ]

        # the output of results is as follows
        # samp_ids, var_names, df, n_var, n_samp
        results = parse.parse_input(self.f2type, self.samp_var2_fp,
                                    self.startcol2, self.endcol2,
                                    self.delimiter2, self.skip2)
        # testing samp_ids match
        assert list(results[0]) == expected_samp_ids2
        # testing var_names match
        assert list(results[1]) == expected_var_names2
        # testing n_var and n_samp match
        assert results[3] == 2
        assert results[4] == 28

        results = parse.parse_input(self.f1type, self.samp_var1_fp,
                                    self.startcol1, self.endcol1,
                                    self.delimiter1, self.skip1)
        # testing samp_ids match
        assert list(results[0]) == expected_samp_ids1
        # testing var_names match
        assert list(results[1]) == expected_var_names1
        # testing n_var and n_samp match
        assert results[3] == 5
        assert results[4] == 28
Example #2
0
    def test_process_df(self):
        """Compare parse.process_df output with the stored expected results.

        For each input file the data is parsed with parse.parse_input, the
        resulting dataframe is run through parse.process_df, and every element
        of the result is compared with the matching entry of
        self.process_df_results.
        """
        # (key into self.process_df_results, parse_input arguments,
        #  extra keyword arguments for assert_almost_equal)
        cases = [
            ('1',
             (self.f1type, self.samp_var1_fp, self.startcol1, self.endcol1,
              self.delimiter1, self.skip1),
             {}),
            # file 2 is compared with decimal=-5; assuming this is
            # numpy.testing.assert_almost_equal, that is a far looser
            # tolerance than the default - TODO confirm it is intended
            ('2',
             (self.f2type, self.samp_var2_fp, self.startcol2, self.endcol2,
              self.delimiter2, self.skip2),
             {'decimal': -5}),
        ]

        for expected_key, parse_args, tolerance in cases:
            # only the sample ids and the dataframe are needed here
            samp_ids, _var_names, df, _n_var, _n_samp = parse.parse_input(
                *parse_args)

            processed = parse.process_df(df, samp_ids)
            for idx in range(len(processed)):
                assert_almost_equal(self.process_df_results[expected_key][idx],
                                    processed[idx], **tolerance)
Example #3
0
def calculate_cutie(input_config_fp):
    """
    Computes pairwise correlations between each variable pair and
    for the significant correlations, recomputes correlation for each pair
    after iteratively excluding n observations, differentiating
    true and false correlations on the basis of whether the correlation remains
    significant when each individual observation is dropped

    input_config_fp: path of the config file; it is read by
        parse.parse_config and also handed to output.init_log.

    Returns None. Results are emitted through the output module (log file,
    summary dataframe, graphs and diagnostic plots).

    Raises ValueError if the configured statistic is not one of the supported
    statistics, or if corr_compare is set while resample_k is not 1.
    Exits via sys.exit() (after printing a message) if the working directory
    or its data_processing subfolder already exists and overwrite is not True.
    """
    # unpack config variables
    (samp_var1_fp, delimiter1, samp_var2_fp, delimiter2, f1type, f2type,
     working_dir, skip1, skip2, startcol1, endcol1, startcol2, endcol2, param,
     statistic, corr_compare, resample_k, paired, overwrite, alpha, multi_corr,
     fold, fold_value, graph_bound, fix_axis) = parse.parse_config(input_config_fp)

    # create subfolder to hold data analysis files; an existing folder is only
    # reused when overwrite is exactly True
    if not os.path.exists(working_dir):
        os.makedirs(working_dir)
    elif overwrite is not True:
        print('Working directory already exists, exiting.')
        sys.exit()

    # NOTE: sub-paths are built by plain string concatenation, so working_dir
    # has to end with a path separator for these to land inside it
    data_processing_dir = working_dir + 'data_processing'
    if not os.path.exists(data_processing_dir):
        os.makedirs(data_processing_dir)
    elif overwrite is not True:
        print('Data_processing directory already exists, exiting.')
        sys.exit()

    # initialize and write log file
    start_time = time.process_time()
    log_fp = output.init_log(working_dir, input_config_fp)

    ###
    # Parsing and Pre-processing
    ###

    # define possible stats
    forward_stats = ['pearson', 'spearman', 'kendall']
    reverse_stats = ['rpearson', 'rspearman', 'rkendall']
    all_stats = forward_stats + reverse_stats
    pearson_stats = ['pearson', 'rpearson']
    spearman_stats = ['spearman', 'rspearman']
    kendall_stats = ['kendall', 'rkendall']
    if statistic not in all_stats:
        raise ValueError('Invalid statistic: %s chosen' % statistic)
    if corr_compare and resample_k != 1:
        raise ValueError('Resample_k must be 1 for pointwise stats')

    # file handling and parsing decisions
    # file 1 is the 'dominant' file type and should always contain the OTU file
    # we let the dominant file 'override' the sample_id list ordering
    samp_ids2, var2_names, samp_var2_df, n_var2, n_samp = parse.parse_input(
        f2type, samp_var2_fp, startcol2, endcol2, delimiter2, skip2)
    output.write_log('The length of variables for file 2 is ' + str(n_var2), log_fp)
    output.write_log('The number of samples for file 2 is ' + str(n_samp), log_fp)
    output.write_log('The md5 of samp_var2 was ' +
                     str(parse.md5_checksum(samp_var2_fp)), log_fp)

    samp_ids1, var1_names, samp_var1_df, n_var1, n_samp = parse.parse_input(
        f1type, samp_var1_fp, startcol1, endcol1, delimiter1, skip1)
    output.write_log('The length of variables for file 1 is ' + str(n_var1), log_fp)
    output.write_log('The number of samples for file 1 is ' + str(n_samp), log_fp)
    output.write_log('The md5 of samp_var1 was ' +
                     str(parse.md5_checksum(samp_var1_fp)), log_fp)

    # if the samp_ids differ, only take common elements; the ordering of
    # file 1 is kept and the set makes each membership test O(1)
    ids_in_file2 = set(samp_ids2)
    samp_ids = [value for value in samp_ids1 if value in ids_in_file2]
    n_samp = len(samp_ids)

    # subset dataframe, obtain avg and variance
    samp_var1 = parse.process_df(samp_var1_df, samp_ids)
    samp_var2 = parse.process_df(samp_var2_df, samp_ids)

    # printing of samp and var names for reference
    output.write_log('There are ' + str(n_samp) + ' samples', log_fp)
    output.write_log('The first 3 samples are ' + str(samp_ids[0:3]), log_fp)
    if len(var1_names) >= 3:
        output.write_log('The first 3 var1 are ' + str(var1_names[0:3]), log_fp)
    else:
        output.write_log('Var1 was ' + str(var1_names), log_fp)
    if len(var2_names) >= 3:
        output.write_log('The first 3 var2 are ' + str(var2_names[0:3]), log_fp)
    else:
        output.write_log('Var2 was ' + str(var2_names), log_fp)

    ###
    # Pearson, Spearman, Kendall
    ###
    # initial output
    pvalues, corrs, r2vals = statistics.assign_statistics(samp_var1,
        samp_var2, statistic, pearson_stats, spearman_stats, kendall_stats,
        paired)

    # determine parameter (either r or p)
    output.write_log('The parameter chosen was ' + param, log_fp)

    # determine significance threshold and number of correlations
    if param == 'p':
        output.write_log('The type of mc correction used was ' + multi_corr, log_fp)
    threshold, n_corr, minp = statistics.set_threshold(pvalues, param, alpha,
                                                       multi_corr, paired)
    output.write_log('The threshold value was ' + str(threshold), log_fp)

    # calculate initial sig candidates
    initial_corr, all_pairs = statistics.get_initial_corr(n_var1, n_var2,
        pvalues, corrs, threshold, param, paired)

    # change initial_corr if doing rCUtIe
    if statistic in reverse_stats:
        initial_corr = set(all_pairs).difference(initial_corr)

    output.write_log('The length of initial_corr is ' + str(len(initial_corr)),
        log_fp)

    # if interested in evaluating dffits, dsr, etc.
    region_sets = []
    if corr_compare:
        infln_metrics = ['cutie_1pc', 'cookd', 'dffits', 'dsr']
        infln_mapping = {
            'cutie_1pc': statistics.resample1_cutie_pc,
            'cookd': statistics.cookd,
            'dffits': statistics.dffits,
            'dsr': statistics.dsr
        }
        (FP_infln_sets, region_combs, region_sets) = statistics.pointwise_comparison(
            infln_metrics, infln_mapping, samp_var1, samp_var2, initial_corr,
            threshold, fold_value, fold, param)

        for region in region_combs:
            output.write_log('The amount of unique elements in set ' +
                             str(region) + ' is ' +
                             str(len(region_sets[str(region)])), log_fp)

        # report results
        for metric in infln_metrics:
            metric_FP = FP_infln_sets[metric]
            output.write_log('The number of false correlations according to ' +
                             metric + ' is ' + str(len(metric_FP)), log_fp)
            output.write_log('The number of true correlations according to ' +
                             metric + ' is ' + str(len(initial_corr) - len(metric_FP)),
                             log_fp)

    # return sets of interest; some of these will be empty dicts depending
    # on the statistic
    (true_corr, true_corr_to_rev, false_corr_to_rev, corr_extrema_p,
     corr_extrema_r, samp_counter, var1_counter,
     var2_counter, exceeds_points, rev_points) = statistics.update_cutiek_true_corr(
        initial_corr, samp_var1, samp_var2, pvalues, corrs, threshold,
        statistic, forward_stats, reverse_stats, resample_k, fold, fold_value, param)

    ###
    # Determine indicator matrices
    ###

    # element i,j is -1 if flagged by CUtIe as FP, 1 if TP,
    # and 0 if insig originally
    true_indicators = utils.return_indicators(n_var1, n_var2, initial_corr,
                                              true_corr, resample_k)

    true_rev_indicators = utils.return_indicators(n_var1, n_var2,
        initial_corr, true_corr_to_rev, resample_k)

    false_rev_indicators = utils.return_indicators(n_var1, n_var2,
        initial_corr, false_corr_to_rev, resample_k)

    if corr_compare:
        # for every region, the pairs of initial_corr that are not in the
        # region's set are passed to return_indicators as the 'true' set
        metric_set_to_indicator = {}
        initial_corr_set = set(initial_corr)
        for region in region_sets:
            region_truths = initial_corr_set.difference(region_sets[region])
            metric_set_to_indicator[region] = utils.return_indicators(
                n_var1, n_var2, initial_corr, {'1': region_truths}, 1)['1']

    ###
    # Report statistics
    ###

    # the two *_rev_indicators columns are labelled differently for forward
    # and reverse statistics (statistic was validated above, so anything that
    # is not a forward statistic is a reverse one)
    if statistic in forward_stats:
        rev_indicator_names = ['TP_rev_indicators', 'FP_rev_indicators']
    else:
        rev_indicator_names = ['FN_rev_indicators', 'TN_rev_indicators']

    # NOTE: summary_df is only assigned inside this loop and is needed for
    # graphing below, so resample_k has to be at least 1
    for k in range(resample_k):
        resample_key = str(k+1)

        # for Spearman and MIC, R2 value stored is same as rho or MIC
        # respectively
        p_ratio = np.divide(corr_extrema_p[resample_key], pvalues)
        r2_ratio = np.divide(corr_extrema_r[resample_key], r2vals)
        variables = [pvalues, corrs, r2vals,
            true_indicators[resample_key], true_rev_indicators[resample_key],
            false_rev_indicators[resample_key], corr_extrema_p[resample_key],
            corr_extrema_r[resample_key], p_ratio, r2_ratio]
        # built fresh on every iteration because it is extended below
        variable_names = (['pvalues', 'correlations', 'r2vals', 'indicators'] +
                          rev_indicator_names +
                          ['extreme_p', 'extreme_r', 'p_ratio', 'r2_ratio'])

        # for pointwise
        if corr_compare:
            variable_names.extend(region_sets)
            variables.extend(metric_set_to_indicator[region]
                             for region in region_sets)

        # Output results, write summary df
        summary_df = output.print_summary_df(n_var1, n_var2, variable_names,
            variables, working_dir, resample_key, n_corr, paired)

        output.report_results(initial_corr, true_corr, true_corr_to_rev,
                              false_corr_to_rev, resample_key, log_fp)

    ###
    # Graphing
    ###

    # create subfolder to hold graphing files
    graphs_dir = working_dir + 'graphs'
    if not os.path.exists(graphs_dir):
        os.makedirs(graphs_dir)

    # summary_df is the one produced by the last resample iteration
    output.graph_subsets(working_dir, var1_names, var2_names, f1type, f2type,
        summary_df, statistic, forward_stats, resample_k, initial_corr,
        true_corr, true_corr_to_rev, false_corr_to_rev, graph_bound, samp_var1,
        samp_var2, all_pairs, region_sets, corr_compare, exceeds_points,
        rev_points, fix_axis)

    output.diag_plots(samp_counter, var1_counter, var2_counter, resample_k,
        working_dir, paired)

    # write log file
    output.write_log('The runtime was ' + str(time.process_time() - start_time), log_fp)
    now = datetime.datetime.now()
    output.write_log('Ended logging at ' + str(now.isoformat()), log_fp)
Example #4
0
def create_json(label, samp_var1_fp, delimiter1, samp_var2_fp, delimiter2,
                f1type, f2type, working_dir, skip, startcol, endcol, statistic,
                resample_k, rm_zero, paired, alpha, mc, stat_names, stat_files,
                log_transform1, log_transform2):
    """
    Writes JSON matrices describing which variable pairs were flagged by
    each of several previously computed statistics.

    Both input files are parsed, the initial statistic and significance
    threshold are computed, the pairs flagged by every metric are read from
    its file, and output.print_json_matrix is called twice (point=False and
    point=True) over all variable pairs.

    samp_var1_fp, delimiter1, f1type: path, delimiter and type of file 1
    samp_var2_fp, delimiter2, f2type: path, delimiter and type of file 2
    working_dir: output folder prefix; sub-paths are built by plain string
        concatenation
    skip, startcol, endcol: passed to parse.parse_input for both files
    statistic: must be one of the Pearson/Spearman based statistics listed
        in the body
    rm_zero, paired, alpha, mc: passed through to the statistics helpers
    stat_names: comma-separated metric names
    stat_files: comma-separated file paths, one per metric and in the same
        order; each file has one header line followed by tab-separated rows
        whose first two columns are parsed as the pair indices
    log_transform1, log_transform2: if truthy, statistics.log_transform is
        called for the corresponding data
    label, resample_k: accepted but not used by this function

    Returns None; the elapsed processor time is printed at the end.

    Raises ValueError if statistic is not a supported linear statistic
    (n_corr and the list of all pairs are only computed for those, so the
    function could not continue), and IndexError if stat_files has fewer
    entries than stat_names.
    """
    # time.clock() was removed in Python 3.8; process_time() matches the
    # timer used by calculate_cutie
    start_time = time.process_time()

    ###
    # Parsing and Pre-processing
    ###

    # create subfolder to hold data analysis files
    data_processing_dir = working_dir + 'data_processing'
    if not os.path.exists(data_processing_dir):
        os.makedirs(data_processing_dir)

    # file handling and parsing decisions
    # file 1 is the 'dominant' file type and should always contain the OTU file
    # we let the dominant file 'override' the sample_id list ordering, so the
    # sample ids of file 2 are discarded
    _, _, samp_to_var2, n_var2, _ = \
        parse.parse_input(f2type, samp_var2_fp, startcol, endcol, delimiter2, skip)
    samp_ids, _, samp_to_var1, n_var1, _ = \
        parse.parse_input(f1type, samp_var1_fp, startcol, endcol, delimiter1, skip)

    # convert dictionaries to matrices; only the first returned element (the
    # data matrix) is needed here
    samp_var1 = parse.dict_to_matrix(samp_to_var1, samp_ids)[0]
    samp_var2 = parse.dict_to_matrix(samp_to_var2, samp_ids)[0]

    ###
    # Simple Linear Regression: Spearman and Pearson
    ###
    pearson_stats = ['kpc', 'jkp', 'bsp', 'rpc', 'rjkp', 'rbsp']
    spearman_stats = ['ksc', 'jks', 'bss', 'rsc', 'rjks', 'rbss']
    linear_stats = pearson_stats + spearman_stats

    # n_corr and all_pairs below only exist for linear statistics; fail with
    # a clear message instead of a NameError further down
    if statistic not in linear_stats:
        raise ValueError('Invalid statistic: %s chosen' % statistic)

    # statistic-specific initial output
    stat_to_matrix = statistics.assign_statistics(samp_var1, samp_var2,
                                                  statistic, rm_zero)
    pvalues = stat_to_matrix['pvalues']

    # determine significance threshold and number of correlations
    threshold, n_corr = statistics.set_threshold(pvalues, alpha, mc, paired)

    # calculate initial sig candidates; only the list of all pairs is used
    _, all_pairs = statistics.initial_sig_SLR(
        n_var1, n_var2, pvalues, threshold, paired)

    # TODO (original author note): candidate for extraction as
    # output.files_to_sets(stat_names, stat_files)
    # returning infln_metrics, FP_infln_sets
    infln_metrics = [str(x) for x in stat_names.split(',')]
    stat_files = [str(x) for x in stat_files.split(',')]
    FP_infln_sets = {}

    for idx, metric in enumerate(infln_metrics):
        flagged_pairs = set()
        with open(stat_files[idx]) as f:
            f.readline()  # skip the header line
            for line in f:
                stripped = line.strip()
                # skip blank lines (e.g. a trailing newline at end of file)
                if not stripped:
                    continue
                parts = stripped.split('\t')
                # indices may be written as floats, e.g. '3.0'
                flagged_pairs.add((int(float(parts[0])), int(float(parts[1]))))
        FP_infln_sets[metric] = flagged_pairs

    # this is to test what is picked up by different statistics
    initial_sig = all_pairs

    for point in (False, True):
        output.print_json_matrix(n_var1,
                                 n_var2,
                                 n_corr,
                                 infln_metrics,
                                 FP_infln_sets,
                                 initial_sig,
                                 working_dir,
                                 paired,
                                 point=point)

    # log transform of data (if log_transform1 or log_transform2 are true)
    # statistic is a linear statistic at this point, so it is never 'prop'.
    # The results are not used further; the calls are kept because they
    # receive working_dir and may write output there - TODO confirm
    if log_transform1:
        statistics.log_transform(samp_var1, working_dir, 1)
    if log_transform2:
        statistics.log_transform(samp_var2, working_dir, 2)

    # do set operations and determine which is unique to each grouping
    # e.g. comparing jkp3, jkpl, jkpn
    # and comparing jkp3, bsp3, and kpc

    print(time.process_time() - start_time)
Example #5
0
def gen_commands_configs(fold_value, statistic, multi_corr, param, datasets,
                         corr_compare, cutie_fp, working_dir, output_dir):
    data_to_params = {
        'hdac': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/data/HDAC_data/GSE15222_series_matrix_x100_del62.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/data/HDAC_data/GSE15222_series_matrix_x100_del62.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'untidy',
            'f2type': 'untidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'lungtx': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/data/lungtx_data/otu_table_L6_filt1e3.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/data/lungtx_data/Genes.KEGG.L3.add_counts.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'untidy',
            'f2type': 'untidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'False',
            'alpha': '0.05'
        },
        'lungc': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/data/pre_sparcc_MSQ/otu_table.MSQ34_L6.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/data/pre_sparcc_MSQ/otu_table.MSQ34_L6.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'untidy',
            'f2type': 'untidy',
            'skip1': '1',
            'skip2': '1',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'lungpt': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/data/lungpt_data/otu_table_MultiO_merged___L6.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/data/lungpt_data/Mapping.Pneumotype.Multiomics.RL.NYU.w_metabolites.w_inflamm.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'untidy',
            'f2type': 'tidy',
            'skip1': '1',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '16',
            'endcol2': '99',
            'paired': 'False',
            'alpha': '0.05'
        },
        'who': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/data/MINE_data/WHOfix.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/data/MINE_data/WHOfix.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '2',
            'endcol1': '356',
            'startcol2': '2',
            'endcol2': '356',
            'paired': 'True',
            'alpha': '0.05'
        },
        'whonous': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/data/MINE_data/WHOnous.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/data/MINE_data/WHOnous.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '2',
            'endcol1': '356',
            'startcol2': '2',
            'endcol2': '356',
            'paired': 'True',
            'alpha': '0.05'
        },
        'covidlong0': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/covid_long_df_early.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/covid_long_df_early.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'covidlong1': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/covid_long_df_moderate.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/covid_long_df_moderate.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'covidlong2': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/covid_long_df_severe.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/covid_long_df_severe.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'covidlongfull': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/covid_long_df_full.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/covid_long_df_full.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'covidmod': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/covid_moderate.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/covid_moderate.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'covidsev': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/covid_severe.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/covid_severe.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'baseball': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/MINE/Baseballfix.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/MINE/Baseballfix.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'ad0': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/mennonites/atopic_dz0.csv',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/mennonites/atopic_dz0.csv',
            'delimiter1': ',',
            'delimiter2': ',',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'ad1': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/mennonites/atopic_dz1.csv',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/mennonites/atopic_dz1.csv',
            'delimiter1': ',',
            'delimiter2': ',',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'oom': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/mennonites/OOM.csv',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/mennonites/OOM.csv',
            'delimiter1': ',',
            'delimiter2': ',',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'roc': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/mennonites/ROC.csv',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/mennonites/ROC.csv',
            'delimiter1': ',',
            'delimiter2': ',',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'mennonites': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/mennonites/df_mennonites.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/mennonites/df_mennonites.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'covid': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/df_covid.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/df_covid.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'airplane': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/data/MINE_data/2008_data.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/data/MINE_data/2008_data.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'ici': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/df_pre.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/df_ici.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'False',
            'alpha': '0.05'
        },
        'liverf': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/liver/df_liver_female_500.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/liver/df_liver_female_500.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'untidy',
            'f2type': 'untidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'liverm': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/liver/df_liver_male_500.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/liver/df_liver_male_500.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'untidy',
            'f2type': 'untidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'micro': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/MINE/Microbiome_fix_500.csv',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/MINE/Microbiome_fix_500.csv',
            'delimiter1': ',',
            'delimiter2': ',',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'hgoral': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/df_oral.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/df_oral.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'hgstool': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/df_stool.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/df_stool.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'crc': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/df_CRC_otu.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/df_CRC_cyto.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'False',
            'alpha': '0.05'
        },
        'ibd': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/df_nat_otu.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/df_nat_meta.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'False',
            'alpha': '0.0047'
        },
        'cell': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/df_cell_Al.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/df_cell_Al.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'nc': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/natcom_fix.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/natcom_fix.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'plos': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/plos_fungi.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/plos_bact.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'False',
            'alpha': '0.05'
        },
        'ca': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/ca_otu.csv',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/ca_plasma.csv',
            'delimiter1': ',',
            'delimiter2': ',',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'False',
            'alpha': '0.05'
        },
        'statin': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/bmis/df_combined.csv',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/bmis/df_combined.csv',
            'delimiter1': ',',
            'delimiter2': ',',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.01128'
        },
        'spatial': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/spatial/df_f1.csv',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/spatial/df_f1.csv',
            'delimiter1': ',',
            'delimiter2': ',',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'True',
            'alpha': '0.05'
        },
        'livermfull': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/liver/df_liver_male.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/liver/df_liver_male.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'untidy',
            'f2type': 'untidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'False',
            'alpha': '0.05',
            'njobs': 1000
        },
        'liverffull': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/liver/df_liver_female.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/liver/df_liver_female.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'untidy',
            'f2type': 'untidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'False',
            'alpha': '0.05',
            'njobs': 1000
        },
        'microfull': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/MINE/Microbiome_fix.csv',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/cutie_exp/inputs/MINE/Microbiome_fix.csv',
            'delimiter1': ',',
            'delimiter2': ',',
            'f1type': 'tidy',
            'f2type': 'tidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'False',
            'alpha': '0.05',
            'njobs': 1000
        },
        'hdacfull': {
            'samp_var1_fp':
            '/sc/arion/projects/clemej05a/kevin/data/HDAC_data/GSE15222_series_matrix_full_del62.txt',
            'samp_var2_fp':
            '/sc/arion/projects/clemej05a/kevin/data/HDAC_data/GSE15222_series_matrix_full_del62.txt',
            'delimiter1': '\\t',
            'delimiter2': '\\t',
            'f1type': 'untidy',
            'f2type': 'untidy',
            'skip1': '0',
            'skip2': '0',
            'startcol1': '-1',
            'endcol1': '-1',
            'startcol2': '-1',
            'endcol2': '-1',
            'paired': 'False',
            'alpha': '0.05',
            'njobs': 10000
        },
    }
    # liverffull, microfull, hdacfull
    # endcol startcol check: lungpt and WHO

    fv = fold_value
    # files = glob.glob(input_dir + '*.txt')
    datasets = datasets.split(',')
    # datasets = ['hdac','lungc','lungpt','who','tx']
    for data in datasets:
        param_to_str = data_to_params[data]

        # example f_id: p_nomc_1_mine_False_lungtx
        # (param, multi_corr, fold_value, statistic, corr_compare, dataset)
        f_id = '_'.join([param, multi_corr, fv, statistic, corr_compare, data])

        ftype, samp_var_fp, startcol, endcol, delimiter, skip = param_to_str['f1type'], \
            param_to_str['samp_var1_fp'], int(param_to_str['startcol1']), \
            int(param_to_str['endcol1']), param_to_str['delimiter1'], int(param_to_str['skip1'])
        samp_ids, var_names, samp_var_df, n_var, n_samp = parse.parse_input(
            ftype, samp_var_fp, startcol, endcol, delimiter, skip)

        try:
            njobs = param_to_str['njobs']
        except:
            njobs = 1

        # build the [startcol2, endcol2] pair written to each job's config
        if njobs > 1:
            # split the columns of samp_var_df into njobs chunks, one per job;
            # samp_var_df is always in tidy format and has already been iloc'd
            dfs = np.array_split(samp_var_df, njobs, axis=1)
            vals = [df.shape[1] for df in dfs]

            col_tuples = [(0, 0)]

            indices = [0]
            indices.extend(vals)
            for i in range(len(indices) - 1):
                t = []
                prev = col_tuples[i]
                t.append(prev[1])
                t.append(indices[i + 1] + prev[1])
                col_tuples.append(t)

            # drop the (0, 0) placeholder that seeded the running offsets
            col_tuples.pop(0)
        else:
            col_tuples = [[param_to_str['startcol2'], param_to_str['endcol2']]]

        for i in range(njobs):
            # per-job id: suffix the job index only when the dataset is split
            if njobs > 1:
                fid = f_id + '_' + str(i)
            else:
                fid = f_id

            # output_dir = '/sc/arion/projects/clemej05a/kevin/data/real_data_analysis/'
            # out_dir = output_dir + f_id + '/'
            out_dir = output_dir + fid + '/'
            try:
                os.makedirs(out_dir)
            except:
                pass
            # working_dir = '/sc/hydra/scratch/buk02/real_data_analysis/'
            working_outdir = working_dir + fid + '/'
            try:
                os.makedirs(working_outdir)
            except:
                pass

            with open(out_dir + 'config_' + fid + '.txt', 'w') as f:
                f.write('[input]')
                f.write('\n')
                f.write('samp_var1_fp: ' + param_to_str['samp_var1_fp'])
                f.write('\n')
                f.write('delimiter1: ' + param_to_str['delimiter2'])
                f.write('\n')
                f.write('samp_var2_fp: ' + param_to_str['samp_var2_fp'])
                f.write('\n')
                f.write('delimiter2: ' + param_to_str['delimiter2'])
                f.write('\n')
                f.write('f1type: ' + param_to_str['f1type'])
                f.write('\n')
                f.write('f2type: ' + param_to_str['f2type'])
                f.write('\n')
                f.write('skip1: ' + param_to_str['skip1'])
                f.write('\n')
                f.write('skip2: ' + param_to_str['skip2'])
                f.write('\n')
                f.write('startcol1: ' + param_to_str['startcol1'])
                f.write('\n')
                f.write('endcol1: ' + param_to_str['endcol1'])
                f.write('\n')
                f.write('startcol2: ' + str(col_tuples[i][0]))
                f.write('\n')
                f.write('endcol2: ' + str(col_tuples[i][1]))
                f.write('\n')
                f.write('paired: ' + param_to_str['paired'])
                f.write('\n')
                f.write('\n')
                f.write('[output]')
                f.write('\n')
                f.write('working_dir: ' + working_outdir)
                f.write('\n')
                f.write('overwrite: True')
                f.write('\n')
                f.write('\n')
                f.write('[stats]')
                f.write('\n')
                f.write('param: ' + param)
                f.write('\n')
                f.write('statistic: ' + statistic)
                f.write('\n')
                f.write('resample_k: 1')
                f.write('\n')
                f.write('alpha: ' + param_to_str['alpha'])
                f.write('\n')
                f.write('mc: ' + multi_corr)
                f.write('\n')
                f.write('fold: True')
                f.write('\n')
                f.write('fold_value: ' + fv)
                f.write('\n')
                f.write('corr_compare: ' + corr_compare)
                f.write('\n')
                f.write('\n')
                f.write('[graph]')
                f.write('\n')
                f.write('graph_bound: 30')
                f.write('\n')
                f.write('fix_axis: False')

            with open(out_dir + 'commands_' + fid + '.txt', 'w') as f:
                f.write('export PYTHONPATH=$PYTHONPATH:/hpc/users/buk02/tools/sandbox/lib/python3.7/site-packages/ && python ' + \
                        cutie_fp + ' -i ' + out_dir + 'config_' + fid + '.txt')