Beispiel #1
0
def setup_simple_sorting_data():
    """Setup function.

    Builds the dataframes that are to be sorted by gene name, one for each
    entry in ``TestingFiles.test_files1``.

    Returns:
        list: the result of ``file_in.read_csv_data`` for every entry, in
        the same order as ``TestingFiles.test_files1``.
    """
    # The column names are detected per file and handed straight to the reader.
    return [
        file_in.read_csv_data(test_file, file_in.check_col_names(test_file))
        for test_file in TestingFiles.test_files1
    ]
Beispiel #2
0
#         if v is None:
#             to_pop.append(k)
#     for i in to_pop:
#         cols.pop(i)
#     df = file_in.strip_data(df, cols.copy())
#     expr2_df = pd.concat([expr2_df, df], axis=1)

# Load expression data from `path` into `master_dfs`, then index it by
# the 'Probe Name' column.
path = '/Users/coltongarelli/Desktop/lpp_nanostring_grouped.csv'
# A path whose last character is "v" is handled as a single file
# (presumably a ".csv" file -- TODO confirm); anything else is handled as a
# directory whose entries are all loaded.
if path[-1] == "v":
    paths = os.path.join(path)
else:
    paths = [os.path.join(path, i) for i in os.listdir(path)]
if isinstance(paths, list):
    for i in paths:
        cols = file_in.check_col_names(i)
        df = file_in.read_csv_data(i, cols)
        # Empty frames are skipped.
        if df.empty:
            pass
        # Single-column frames are accumulated.
        # NOTE(review): `master_dfs` is not defined in the visible part of
        # this script before this append -- confirm it is created earlier.
        elif df.shape[1] == 1:
            master_dfs.append(df)
        # Any other frame replaces `master_dfs` outright.
        else:
            master_dfs = df
else:
    cols = file_in.check_col_names(paths)
    master_dfs = file_in.read_csv_data(paths, cols)

# transformed = gene_stats.quantile_norm([expr1_df, expr2_df])
# NOTE(review): this call needs `master_dfs` to expose `set_index`; if only
# the append branch above ran, `master_dfs` would still be a list -- confirm.
master_dfs.set_index('Probe Name', inplace=True)
genes = [
    'CXCL10', 'CXCR3', 'JAK3', 'ISG15', 'HLA-DPA1', 'VCAM1', 'CIITA', "KLRK1",
    "KLRB1", 'IRAK3', 'SLAMF1'
Beispiel #3
0
def test_good_value_cutoff():
    """
    Test the cutoff function against stored reference files.

    FUT should take:
        a dataframe with some kind of identifier ('symbol') and at least one
        other column with float values, plus the name of the column whose
        values are to be cut off.
    FUT should return:
        a dataframe containing all values from the input df below the input
        threshold.

    Four cutoffs are checked on the same input data: adjusted p-value with
    an upper bound of 0.05 and of 0.001, and log fold change bounded to
    [-2, 2] and to [-4, 4].
    """

    def load(relative_path):
        # Paths are resolved against the current working directory. The
        # column mapping is built fresh for every read.
        return file_in.read_csv_data(
            path=(os.path.join(os.getcwd(), relative_path)),
            col_names={
                'logfc': 'logfc-1',
                'pval': 'adj-pvalue-1',
                'symbol': 'symbol'
            })

    test_df = load('testing_files/test_files/cutoff_test_data.csv')

    # (reference file, column to cut on, bounds passed to value_cutoff)
    cases = (
        ('testing_files/reference_files/05_pvalue_ref.csv',
         'adj-pvalue-1', {'upper': 0.05}),
        ('testing_files/reference_files/low_pval_cutoff_ref.csv',
         'adj-pvalue-1', {'upper': 0.001}),
        ('testing_files/reference_files/logfc2_cutoff_ref.csv',
         'logfc-1', {'upper': 2, 'lower': -2}),
        ('testing_files/reference_files/logfc4_cutoff_ref.csv',
         'logfc-1', {'upper': 4, 'lower': -4}),
    )

    for reference_file, column, bounds in cases:
        ref_df = load(reference_file)
        actual = modify_dataframes.value_cutoff(df=test_df,
                                                col_name=column,
                                                **bounds)
        pd.testing.assert_frame_equal(
            actual,
            ref_df,
            # index dtype must match
            check_index_type=True,
            # the `names` attributes of index/columns need not match
            check_names=False,
            # numbers are compared exactly, without tolerance
            check_exact=True,
            # order of index and columns is ignored
            check_like=True)
Beispiel #4
0
def make_test_df(path):
    """Read the file at ``path`` and return the resulting dataframe.

    The column names are first detected with ``file_in.check_col_names`` and
    then passed, together with ``path``, to ``file_in.read_csv_data``.

    Args:
        path: location of the file to read.

    Returns:
        Whatever ``file_in.read_csv_data`` returns for this file.
    """
    return file_in.read_csv_data(path, file_in.check_col_names(path))
Beispiel #5
0
import data_processing.file_in as file_in
import Source.Visualization.heatmap as heatmap
import os
import matplotlib.pyplot as plt


# Read one data file, save a clustermap of it as a PNG, and keep the loaded
# frame in `master_dfs`.
master_dfs = list()
path = '/Users/coltongarelli/Desktop/Geo data for comps/working files/all_lupus.csv'
# files = [os.path.join(path, i) for i in os.listdir(path)]
# Only a single file is processed; the directory loop is commented out.
file = path
# for i in files:
cols = file_in.check_col_names(file)
df = file_in.read_csv_data(file, cols)
# The object returned by sns_clustermap is expected to provide `savefig`.
hm = heatmap.sns_clustermap(df)
# The output file name is "Lupus_clustermap.png" inside the plots directory.
hm.savefig(os.path.join("/Users/coltongarelli/Desktop/Richmond Lab/Plots/Canine plots/", "{}_clustermap.png".format("Lupus")))


master_dfs.append(df)

# genes_of_interest = ["IL16", "TSLP", "CXCL8", "CXCL10",
#                      "CXCL11", "CXCL12", "CCL20", "CXCL13", "CXCL16",
#                      "CXCL14", "CX3CL1", "CCL1", "CCL2",
#                      "CCL3", "CCL4", "CCL5", "CCL7", "CCL8",
#                      "CCL13", "CCL23", "CCL16", "CCL17", "CCL19",
#                      "CCL20", "CCL21", "CCL22", "CCL24", "CCL25",
#                      "CCL26", "CCL27", "CCL28", "PPBP", "CXCL17"]
# genes_of_interest = pd.read_csv('/Users/coltongarelli/Desktop/Geo data for comps/working files/panels.csv')
# # genes = master_dfs[0].index.to_list()
# # genes_to_keep = [gene for gene in genes if gene in genes_of_interest]

import pandas as pd
import ntpath
import matplotlib.pyplot as plt

# Load the data to analyse into `master_dfs`: a list of frames when `paths`
# is a list of files, or a single deep-copied frame when `paths` is one path.
master_dfs = []
paths = "/Users/coltongarelli/Desktop/GEO data for comps/Canine Nanostring data/dle vs ref with pvalue.csv"
# paths = input("input the path of a file containing file names for data to be analyzed, or hit enter: ")
# Compare by value: `is ""` tests object identity, which is not guaranteed to
# hold for equal strings and raises a SyntaxWarning on Python 3.8+.
if paths == "":
    # No path given: fall back to the default set of toptables.
    paths = [os.path.join("/Users/coltongarelli/Dropbox/toptables/scle_toptable_abs-logfc-1.csv"),
             os.path.join("/Users/coltongarelli/Dropbox/toptables/acle_toptable_abs-logfc-1.csv"),
             os.path.join("/Users/coltongarelli/Dropbox/toptables/dle_toptable_abs-logfc-1.csv")]

if isinstance(paths, list):
    for i in paths:
        cols = file_in.check_col_names(i, pvalue=True)
        df = file_in.read_csv_data(i, cols)
        # drop duplicate values
        df = modify_dataframes.remove_duplicate_indicies(df)
        # remove unneeded data
        df = file_in.strip_data(df, cols.copy())
        # add to the master list of dfs
        master_dfs.append(df)
else:
    # Single file: same steps as above, except the data is not stripped and
    # `master_dfs` becomes the frame itself rather than a list.
    cols = file_in.check_col_names(paths, pvalue=True)
    df = file_in.read_csv_data(paths, cols)
    df = modify_dataframes.remove_duplicate_indicies(df)
    # df = file_in.strip_data(df, cols.copy())
    master_dfs = df.copy(deep=True)

counter = 0
working_dfs = []
            "/Users/coltongarelli/Desktop/GEO data for comps/B19-1182_06-logfc.csv"
        ),
        os.path.join(
            "/Users/coltongarelli/Desktop/GEO data for comps/GSE113113_late_avg.csv"
        ),
        os.path.join(
            "/Users/coltongarelli/Desktop/GEO data for comps/GSE113113_early_avg.csv"
        )
    ]
    # os.path.join("/Users/coltongarelli/Desktop/GEO data for comps/formatted/GSE81071_scle_healthyvscle_formatted.csv"),
    # os.path.join("/Users/coltongarelli/Desktop/GEO data for comps/formatted/GSE81071_cdle_healthyvdle_formatted.csv")

# paths = [os.path.join('/Users/coltongarelli/Desktop/Geo data for comps/working files/all_lupus.csv')]
# Load every file in `paths`, drop duplicated index entries and collect the
# resulting frames in `master_dfs`.
for i in paths:
    cols = file_in.check_col_names(i)
    df = file_in.read_csv_data(i, cols)
    # df = file_in.make_unique_index(df)
    # drop duplicate values
    df = modify_dataframes.remove_duplicate_indicies(df)
    # remove unneeded data
    # TODO remove requirement for pvalue (or other empty required field)....should probably think of a better soln
    # The keys are collected first because entries cannot be removed from a
    # dict while iterating over it.
    to_pop = [name for name, value in cols.items() if value is None]
    # A separate loop variable is used so that `i` keeps naming the current
    # path instead of being overwritten by a column key.
    for key in to_pop:
        cols.pop(key)
    # df = file_in.strip_data(df, cols.copy())
    master_dfs.append(df)

# Work on deep copies so the loaded frames in `master_dfs` stay untouched.
working_dfs = [frame.copy(deep=True) for frame in master_dfs]