def setup_simple_sorting_data():
    """Setup function.

    Creates dataframes to be sorted by gene name.

    Returns:
        list of pandas.DataFrame: one dataframe per file in TestingFiles.test_files1.
    """
    dfs = list()
    for i in TestingFiles.test_files1:
        cols = file_in.check_col_names(i)
        dfs.append(file_in.read_csv_data(i, cols))
    return dfs
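# Illustrative only: a minimal sketch of how the fixture above might be consumed by a
# sorting test. "Sort by gene name" is assumed to mean ordering on the dataframe index;
# the real function under test and its expected output live elsewhere in the project.
def _example_sorted_fixture_usage():
    for df in setup_simple_sorting_data():
        expected = df.sort_index()  # hypothetical reference ordering by gene name
        assert expected.index.is_monotonic_increasing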
# Leftover fragments from an earlier column-stripping pass, kept for reference:
# if v is None:
#     to_pop.append(k)
# for i in to_pop:
#     cols.pop(i)
# df = file_in.strip_data(df, cols.copy())
# expr2_df = pd.concat([expr2_df, df], axis=1)

path = '/Users/coltongarelli/Desktop/lpp_nanostring_grouped.csv'
# A path ending in "v" is treated as a single .csv file; otherwise it is a directory of files.
if path[-1] == "v":
    paths = os.path.join(path)
else:
    paths = [os.path.join(path, i) for i in os.listdir(path)]
if isinstance(paths, list):
    for i in paths:
        cols = file_in.check_col_names(i)
        df = file_in.read_csv_data(i, cols)
        if df.empty:
            pass
        elif df.shape[1] == 1:
            master_dfs.append(df)
        else:
            master_dfs = df
else:
    cols = file_in.check_col_names(paths)
    master_dfs = file_in.read_csv_data(paths, cols)
# transformed = gene_stats.quantile_norm([expr1_df, expr2_df])
master_dfs.set_index('Probe Name', inplace=True)
genes = [
    'CXCL10', 'CXCR3', 'JAK3', 'ISG15', 'HLA-DPA1', 'VCAM1', 'CIITA',
    'KLRK1', 'KLRB1', 'IRAK3', 'SLAMF1'
]
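# Illustrative only: a hedged sketch of one way the genes list above could be used to
# subset the probe-indexed dataframe. Using Index.intersection avoids KeyErrors for
# panel genes absent from this particular NanoString file. This is an assumption about
# intent, not code from the original script; the variable names are hypothetical.
genes_present = master_dfs.index.intersection(genes)
genes_of_interest_df = master_dfs.loc[genes_present]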
def test_good_value_cutoff():
    """Test cutoff function.

    FUT should take: a dataframe with some kind of identifier ('symbol') and at least
    one other column with float values. The name(s) of the column containing float
    values to be cut off should be passed in as a list.

    FUT should return: a dataframe containing all values from the input df below the
    input threshold.
    """
    test_df = file_in.read_csv_data(
        path=os.path.join(os.getcwd(), 'testing_files/test_files/cutoff_test_data.csv'),
        col_names={'logfc': 'logfc-1', 'pval': 'adj-pvalue-1', 'symbol': 'symbol'})

    actual = modify_dataframes.value_cutoff(upper=0.05, df=test_df, col_name='adj-pvalue-1')
    ref_df = file_in.read_csv_data(
        path=os.path.join(os.getcwd(), 'testing_files/reference_files/05_pvalue_ref.csv'),
        col_names={'logfc': 'logfc-1', 'pval': 'adj-pvalue-1', 'symbol': 'symbol'})
    pd.testing.assert_frame_equal(
        actual, ref_df,
        # check index dtype
        check_index_type=True,
        # don't require identical column names
        check_names=False,
        # compare numerical values exactly
        check_exact=True,
        # ignore column/index order
        check_like=True)

    ref_df = file_in.read_csv_data(
        path=os.path.join(os.getcwd(), 'testing_files/reference_files/low_pval_cutoff_ref.csv'),
        col_names={'logfc': 'logfc-1', 'pval': 'adj-pvalue-1', 'symbol': 'symbol'})
    actual = modify_dataframes.value_cutoff(upper=0.001, df=test_df, col_name='adj-pvalue-1')
    # same comparison settings as above
    pd.testing.assert_frame_equal(
        actual, ref_df,
        check_index_type=True,
        check_names=False,
        check_exact=True,
        check_like=True)

    ref_df = file_in.read_csv_data(
        path=os.path.join(os.getcwd(), 'testing_files/reference_files/logfc2_cutoff_ref.csv'),
        col_names={'logfc': 'logfc-1', 'pval': 'adj-pvalue-1', 'symbol': 'symbol'})
    actual = modify_dataframes.value_cutoff(upper=2, lower=-2, df=test_df, col_name='logfc-1')
    pd.testing.assert_frame_equal(
        actual, ref_df,
        check_index_type=True,
        check_names=False,
        check_exact=True,
        check_like=True)

    ref_df = file_in.read_csv_data(
        path=os.path.join(os.getcwd(), 'testing_files/reference_files/logfc4_cutoff_ref.csv'),
        col_names={'logfc': 'logfc-1', 'pval': 'adj-pvalue-1', 'symbol': 'symbol'})
    actual = modify_dataframes.value_cutoff(upper=4, lower=-4, df=test_df, col_name='logfc-1')
    pd.testing.assert_frame_equal(
        actual, ref_df,
        check_index_type=True,
        check_names=False,
        check_exact=True,
        check_like=True)
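# Illustrative only: a minimal sketch of the one-sided cutoff contract described in the
# docstring above ("return a dataframe containing all values below the input threshold").
# This is an assumption for readability, not the project's modify_dataframes.value_cutoff,
# and it deliberately ignores the two-sided upper/lower case exercised later in the test.
def one_sided_cutoff_sketch(df, col_name, upper):
    # keep rows whose value in col_name falls strictly below the threshold
    return df[df[col_name] < upper]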
def make_test_df(path):
    cols = file_in.check_col_names(path)
    df = file_in.read_csv_data(path, cols)
    return df
import data_processing.file_in as file_in
import Source.Visualization.heatmap as heatmap
import os
import matplotlib.pyplot as plt

master_dfs = list()
path = '/Users/coltongarelli/Desktop/Geo data for comps/working files/all_lupus.csv'
# files = [os.path.join(path, i) for i in os.listdir(path)]
file = path
# for i in files:
cols = file_in.check_col_names(file)
df = file_in.read_csv_data(file, cols)
hm = heatmap.sns_clustermap(df)
hm.savefig(os.path.join("/Users/coltongarelli/Desktop/Richmond Lab/Plots/Canine plots/",
                        "{}_clustermap.png".format("Lupus")))
master_dfs.append(df)
# genes_of_interest = ["IL16", "TSLP", "CXCL8", "CXCL10",
#                      "CXCL11", "CXCL12", "CCL20", "CXCL13", "CXCL16",
#                      "CXCL14", "CX3CL1", "CCL1", "CCL2",
#                      "CCL3", "CCL4", "CCL5", "CCL7", "CCL8",
#                      "CCL13", "CCL23", "CCL16", "CCL17", "CCL19",
#                      "CCL20", "CCL21", "CCL22", "CCL24", "CCL25",
#                      "CCL26", "CCL27", "CCL28", "PPBP", "CXCL17"]
# genes_of_interest = pd.read_csv('/Users/coltongarelli/Desktop/Geo data for comps/working files/panels.csv')
# genes = master_dfs[0].index.to_list()
# genes_to_keep = [gene for gene in genes if gene in genes_of_interest]
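# Illustrative only: a hedged sketch of the kind of wrapper heatmap.sns_clustermap could
# be, inferred from how it is called above (takes a dataframe, returns an object with a
# .savefig method). This is an assumption, not the project's actual implementation;
# seaborn.clustermap returns a ClusterGrid, which exposes savefig().
import seaborn as sns

def sns_clustermap_sketch(data_df):
    # z_score=0 standardizes each row (gene) so samples cluster on relative expression
    return sns.clustermap(data_df, z_score=0, cmap="vlag")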
import pandas as pd
import ntpath
import matplotlib.pyplot as plt
import os
import data_processing.file_in as file_in
# module path for modify_dataframes assumed from the project layout
import data_processing.modify_dataframes as modify_dataframes

master_dfs = list()
paths = "/Users/coltongarelli/Desktop/GEO data for comps/Canine Nanostring data/dle vs ref with pvalue.csv"
# paths = input("input the path of a file containing file names for data to be analyzed, or hit enter: ")
if paths == "":
    paths = [os.path.join("/Users/coltongarelli/Dropbox/toptables/scle_toptable_abs-logfc-1.csv"),
             os.path.join("/Users/coltongarelli/Dropbox/toptables/acle_toptable_abs-logfc-1.csv"),
             os.path.join("/Users/coltongarelli/Dropbox/toptables/dle_toptable_abs-logfc-1.csv")]
if isinstance(paths, list):
    for i in paths:
        cols = file_in.check_col_names(i, pvalue=True)
        df = file_in.read_csv_data(i, cols)
        # drop duplicate values
        df = modify_dataframes.remove_duplicate_indicies(df)
        # remove unneeded data
        df = file_in.strip_data(df, cols.copy())
        # add to the master list of dfs
        master_dfs.append(df)
else:
    cols = file_in.check_col_names(paths, pvalue=True)
    df = file_in.read_csv_data(paths, cols)
    df = modify_dataframes.remove_duplicate_indicies(df)
    # df = file_in.strip_data(df, cols.copy())
    master_dfs = df.copy(deep=True)

counter = 0
working_dfs = list()
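# Illustrative only: a hedged sketch of what remove_duplicate_indicies is presumed to do
# here (keep the first row for each duplicated index label), drawn from the
# "drop duplicate values" comment above. Not the project's actual implementation.
def remove_duplicate_indices_sketch(df):
    return df[~df.index.duplicated(keep='first')]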
"/Users/coltongarelli/Desktop/GEO data for comps/B19-1182_06-logfc.csv" ), os.path.join( "/Users/coltongarelli/Desktop/GEO data for comps/GSE113113_late_avg.csv" ), os.path.join( "/Users/coltongarelli/Desktop/GEO data for comps/GSE113113_early_avg.csv" ) ] # os.path.join("/Users/coltongarelli/Desktop/GEO data for comps/formatted/GSE81071_scle_healthyvscle_formatted.csv"), # os.path.join("/Users/coltongarelli/Desktop/GEO data for comps/formatted/GSE81071_cdle_healthyvdle_formatted.csv") # paths = [os.path.join('/Users/coltongarelli/Desktop/Geo data for comps/working files/all_lupus.csv')] for i in paths: cols = file_in.check_col_names(i) df = file_in.read_csv_data(i, cols) # df = file_in.make_unique_index(df) # drop duplicate values df = modify_dataframes.remove_duplicate_indicies(df) # remove unneeded data to_pop = list() # TODO remove requirement for pvalue (or other empty required field)....should probably think of a better soln for k, v in cols.items(): if v is None: to_pop.append(k) for i in to_pop: cols.pop(i) # df = file_in.strip_data(df, cols.copy()) master_dfs.append(df) working_dfs = [i.copy(deep=True) for i in master_dfs]