def main():
    """Build the ALL (MRD) example dataset and store it as a pickle.

    Reads the MRD, healthy and control CSV exports from a hard-coded
    directory, keeps the marker columns listed in ``full_labels``,
    arcsinh-transforms them through ``ftrans`` (second argument 5), keeps
    the 500 MRD rows with the largest transformed CD10 value and writes
    everything to ``<cellCnn package dir>/examples/data/ALL.pkl``.

    The pickled dict has the keys ``'control'``, ``'healthy'``, ``'ALL'``
    and ``'labels'``.

    Returns:
        int: always 0.
    """
    PATH = "/Volumes/biol_imsb_claassen_s1/eiriniar/Data/viSNE/mrd_debarcode"

    def read_table(csv_name):
        # all three inputs are comma-separated files in the same directory
        return pd.read_csv(os.path.join(PATH, csv_name), sep=',')

    mrd_data = read_table('mrd_debarcoded.csv')
    healthy_data = read_table('healthy_debarcoded.csv')
    control_data = read_table('visne_marrow1.csv')

    # all available channels (taken from the control file)
    channels = list(control_data.columns)

    # which markers should be kept for further analysis
    full_labels = [
        'CD19(Nd142)Di', 'CD22(Nd143)Di', 'CD47(Nd145)Di', 'CD79b(Nd146)Di',
        'CD20(Sm147)Di', 'CD34(Nd148)Di', 'CD179a(Sm149)Di', 'CD72(Eu151)Di',
        'IgM-i(Eu153)Di', 'CD45(Sm154)Di', 'CD10(Gd156)Di',
        'CD179b(Gd158)Di', 'CD11c(Tb159)Di', 'CD14(Gd160)Di', 'CD24(Dy161)Di',
        'CD127(Dy162)Di', 'TdT(Dy163)Di', 'CD15(Dy164)Di', 'Pax5(Ho165)Di',
        'CD38(Er168)Di', 'CD3(Er170)Di', 'CD117(Yb171)Di',
        'CD49d(Yb172)Di', 'CD33(Yb173)Di', 'HLADR(Yb174)Di', 'IgM-s(Lu175)Di',
        'CD7(Yb176)Di',
    ]
    # short marker names: everything before the metal tag in parentheses
    labels = [full_name.split('(')[0] for full_name in full_labels]

    # column positions of the interesting markers, looked up in the control
    # file and applied positionally to all three tables
    # NOTE(review): assumes the MRD and healthy files share the control
    # file's column order -- confirm.
    marker_idx = [channels.index(full_name) for full_name in full_labels]

    def select_and_transform(table):
        # keep only interesting markers and arcsinh-transform the data
        return ftrans(np.asarray(table)[:, marker_idx], 5)

    x_mrd = select_and_transform(mrd_data)
    x_healthy = select_and_transform(healthy_data)
    x_control = select_and_transform(control_data)

    # select CD10+ blasts: the 500 rows with the highest CD10 value
    cd10_col = labels.index('CD10')
    by_cd10 = np.argsort(x_mrd[:, cd10_col])
    x_mrd = x_mrd[by_cd10[-500:]]

    # save the pre-processed dataset
    pickle_dir = os.path.join(cellCnn.__path__[0], 'examples', 'data')
    mkdir_p(pickle_dir)
    pickle_file = os.path.join(pickle_dir, 'ALL.pkl')
    data_dict = {
        'control': x_control,
        'healthy': x_healthy,
        'ALL': x_mrd,
        'labels': labels,
    }
    with open(pickle_file, 'wb') as f:
        pickle.dump(data_dict, f, -1)
    return 0
def load_fcs_dataset(fcs_info_file, marker_file, co_factor=5):
    """Load every FCS sample listed in an info file, restricted to given markers.

    Args:
        - fcs_info_file (str) :
            Path to a comma-separated info file. Its first column holds the
            FCS file names and its second column the integer labels
            (phenotypes). FCS files are looked up in the directory that
            contains this info file.
        - marker_file (str) :
            Path to a comma-separated file whose header row lists the
            marker names to keep.
        - co_factor (float) :
            Cofactor handed to `ftrans` as its second argument. The original
            author documents the transform as
            `x_normalized = arcsinh(co_factor * x)`.
            NOTE(review): `ftrans` is defined elsewhere -- confirm whether
            it multiplies or divides by the cofactor.

    Returns:
        - samples (list) : one transformed array per FCS file, columns
          ordered like the marker names.
        - phenotypes (list) : the integer label of each sample, in the
          same order as `samples`.

    Raises:
        ValueError : if a marker name is missing from an FCS file's
        channel list (raised by `list.index`).
    """
    info_table = np.array(pd.read_csv(fcs_info_file, sep=','))
    marker_names = list(pd.read_csv(marker_file, sep=',').columns)
    sample_ids = info_table[:, 0]
    sample_labels = info_table[:, 1].astype(int)
    fcs_dir = os.path.dirname(fcs_info_file)

    samples = []
    for fcs_file in sample_ids:
        fcs = loadFCS(os.path.join(fcs_dir, fcs_file),
                      transform=None, auto_comp=False)
        # map marker names to column positions for this particular file
        columns = [fcs.channels.index(name) for name in marker_names]
        samples.append(ftrans(np.asarray(fcs)[:, columns], co_factor))

    # one label per sample, in file order
    phenotypes = list(sample_labels)
    return samples, phenotypes
def read_healthy_data(basedir, keys, stimuli, marker_idx):
    """Collect the transformed marker data of each healthy sample.

    For every key, the files ``<basedir>/<key>/<key>_<stim>_PhenoGraph.csv``
    (one per entry of ``stimuli``) are loaded with ``fcm.loadFCS``, reduced
    to the columns in ``marker_idx``, arcsinh-transformed through ``ftrans``
    (second argument 5) and stacked row-wise.

    Args:
        basedir: directory that contains one sub-directory per key.
        keys: sample identifiers (sub-directory names).
        stimuli: stimulation condition names used in the file names.
        marker_idx: column positions of the markers to keep.

    Returns:
        dict mapping each key to the row-wise stack of all conditions that
        could be loaded.

    Raises:
        ValueError: from ``np.vstack`` when no file of a key could be loaded.
    """
    lookup = {}
    for key in keys:
        sample_dir = os.path.join(basedir, key)
        per_condition = []
        for stim in stimuli:
            fname = os.path.join(
                sample_dir, '_'.join((key, stim, 'PhenoGraph.csv')))
            try:
                # load the raw data
                raw = fcm.loadFCS(fname, transform=None)
                # show which channels the requested columns correspond to
                print([raw.channels[col] for col in marker_idx])
                # select interesting markers and arcsinh-transform
                per_condition.append(
                    ftrans(np.asarray(raw)[:, marker_idx], 5))
            except Exception:
                # best effort: report the file and carry on with the rest
                print('Problem loading: ' + fname)
        # merge data from different stimulation conditions
        lookup[key] = np.vstack(per_condition)
    return lookup
def read_healthy_data(basedir, keys, stimuli, marker_idx):
    """Collect the transformed marker data of each healthy sample.

    For every key, the files ``<basedir>/<key>/<key>_<stim>_PhenoGraph.csv``
    (one per entry of ``stimuli``) are loaded with ``fcm.loadFCS``, reduced
    to the columns in ``marker_idx`` and arcsinh-transformed through
    ``ftrans`` (second argument 5). The per-condition arrays of a key are
    stacked row-wise and stored under that key.

    This copy of the function previously stopped after the loading loop:
    it neither filled ``lookup`` nor returned it. It now stores the stacked
    data and returns the dict.

    Args:
        basedir: directory that contains one sub-directory per key.
        keys: sample identifiers (sub-directory names).
        stimuli: stimulation condition names used in the file names.
        marker_idx: column positions of the markers to keep.

    Returns:
        dict mapping each key to the row-wise stack of all conditions that
        could be loaded (empty dict when ``keys`` is empty).

    Raises:
        ValueError: from ``np.vstack`` when no file of a key could be loaded.
    """
    lookup = dict()
    for key in keys:
        subdir = os.path.join(basedir, key)
        data_list = []
        for stim in stimuli:
            fname = os.path.join(subdir, '_'.join([key, stim, 'PhenoGraph.csv']))
            try:
                # load the raw data
                x = fcm.loadFCS(fname, transform=None)
                # show which channels the requested columns correspond to
                print([x.channels[ii] for ii in marker_idx])
                # select interesting markers and arcsinh-transform
                x = ftrans(np.asarray(x)[:, marker_idx], 5)
                data_list.append(x)
            except Exception:
                # best effort: report the file and carry on with the rest
                print('Problem loading: ' + fname)
        # merge data from different stimulation conditions
        lookup[key] = np.vstack(data_list)
    return lookup
def no_inhibitor_lookup_full(data_path, stimuli, ctypes, marker_idx):
    """Load the high-dose ('H') FCS files of every sample below ``data_path``.

    For each immediate sub-directory ``key`` and each combination of cell
    type and stimulus, the file ``<key>_<ctype>_H<stim>.fcs`` is read,
    reduced to the columns in ``marker_idx`` and arcsinh-transformed
    through ``ftrans`` (second argument 5). Columns 10 onwards
    (intracellular markers, per the original author's comment) are
    mean-centred with a ``StandardScaler(with_std=False)`` that is fitted
    on the file with ``ctype == 'cd4+'`` and ``stim == '05'``.

    Files that cannot be processed are skipped, and are now reported
    instead of being dropped silently.

    NOTE(review): the scaler is only fitted once the 'cd4+' / '05' file is
    reached. Files visited before it presumably fail in
    ``scaler.transform`` and are skipped, so ``ctypes`` / ``stimuli``
    should list that combination first -- confirm against the callers.

    Args:
        data_path: directory with one sub-directory per sample.
        stimuli: stimulus codes used in the file names; the position of a
            stimulus in this sequence becomes its integer label in ``'y'``.
        ctypes: cell type names used in the file names.
        marker_idx: column positions of the markers to keep.

    Returns:
        dict mapping each sample key to a dict with the entries
        ``'X'`` (stacked data), ``'y'`` (stimulus index per row),
        ``'ctype'`` (cell type per row, flattened with ``flat_list``),
        ``'labels'`` (channel names of the kept columns, taken from the
        first file that loaded) and ``'scaler'`` (the per-sample scaler).

    Raises:
        ValueError: from ``np.vstack`` when no file of a sample could be
        processed.
    """
    lookup = dict()
    dose = 'H'
    labels = None
    print(stimuli)
    for key in get_immediate_subdirectories(data_path):
        subdir = os.path.join(data_path, key)
        full_data_list = []
        stim_list, ctype_list = [], []
        scaler = StandardScaler(with_std=False)
        for ctype in ctypes:
            for jj, stim in enumerate(stimuli):
                fname = os.path.join(
                    subdir, '{0}_{1}_{2}{3}.fcs'.format(key, ctype, dose, stim))
                try:
                    # read the .fcs file
                    x_full = fcm.loadFCS(fname, transform=None)
                    if labels is None:
                        labels = [x_full.channels[col] for col in marker_idx]

                    # keep only interesting markers and arcsinh-transform the data
                    x_full = ftrans(np.asarray(x_full)[:, marker_idx], 5)

                    # fit a mean-shift scaler on control CD4+ T-cells
                    # (only on intracellular markers)
                    if (ctype == 'cd4+') and (stim == '05'):
                        scaler.fit(x_full[:, 10:])

                    # and transform all cell types using this scaler
                    x_full[:, 10:] = scaler.transform(x_full[:, 10:])
                except Exception as err:
                    # best effort, but make the skipped file visible
                    print('Problem loading: ' + fname + ' (' + str(err) + ')')
                    continue

                # accumulate all the data seen so far along with their labels;
                # done outside the try so the three lists always stay aligned
                n_cells = x_full.shape[0]
                full_data_list.append(x_full)
                stim_list.append(jj * np.ones(n_cells, dtype=int))
                ctype_list.append([ctype] * n_cells)

        lookup[key] = {
            'X': np.vstack(full_data_list),
            'y': np.hstack(stim_list),
            'ctype': flat_list(ctype_list),
            'labels': labels,
            'scaler': scaler,
        }
    return lookup
def load_data():
    """Load the samples listed in the clinical info table.

    Reads the tab-separated file ``clinical_data_flow_repository.csv`` from
    ``BASEDIR``; each row is unpacked into a sample id and two outcome
    values (survival time and status, judging by the variable names used
    by the original author). Rows whose first outcome value is not
    positive are ignored. For the remaining rows the file
    ``<prefix>_<ID>_<suffix>`` in ``PATH_COMP`` is loaded with
    ``fcm.loadFCS``, reduced to the module-level ``marker_idx`` columns
    and arcsinh-transformed through ``ftrans`` (second argument 150).
    Only samples with more than 3000 rows are kept.

    Returns:
        tuple ``(data_list, name_list, y)``: the kept arrays, their file
        paths, and an array with one row per kept sample holding
        (first outcome value, second outcome value, row index in the table).

    Raises:
        ValueError: from ``np.hstack`` when no sample was kept.
    """
    infofile = os.path.join(BASEDIR, 'clinical_data_flow_repository.csv')
    df = pd.read_csv(infofile, sep='\t')

    data_list, name_list = [], []
    times, statuses, row_ids = [], [], []

    for ii in df.index:
        ID, y1, y2 = df.iloc[ii]

        # analyze only samples with positive survival times
        if not (y1 > 0):
            continue

        file1 = os.path.join(PATH_COMP, '_'.join([prefix, str(ID), suffix]))
        try:
            # load the raw .fcs data
            raw = fcm.loadFCS(file1, transform=None)
            # keep only interesting markers and arcsinh-transform the data
            X = ftrans(np.asarray(raw)[:, marker_idx], 150)

            # discard samples that do not have more than 3000 cells
            if X.shape[0] > 3000:
                print(X.shape)
                data_list.append(X)
                name_list.append(file1)
                times.append(y1)
                statuses.append(y2)
                row_ids.append(ii)
        except Exception:
            # best effort: report the sample and continue with the next row
            print('Could not find or load sample: ' + file1)

    # one column each for time, status and table row index
    time_col = np.hstack(times).reshape(-1, 1)
    status_col = np.hstack(statuses).reshape(-1, 1)
    id_col = np.hstack(row_ids).reshape(-1, 1)
    y = np.hstack([time_col, status_col, id_col])
    return data_list, name_list, y
def main():
    """Build the AML example dataset and store it as a pickle.

    Loads the healthy bone-marrow samples H1-H5 through
    ``read_healthy_data``, reads the gated AML blast populations from the
    tab-separated ``.txt`` exports in ``<FCS_DATA_PATH>/AML_blasts``, keeps
    the surface markers listed in ``labels``, arcsinh-transforms them
    through ``ftrans`` (second argument 5) and writes everything to
    ``<cellCnn package dir>/examples/data/AML.pkl``.

    The pickled dict has the entries ``'healthy_BM'`` (list of
    ``(key, data)`` pairs), one entry per AML patient id, and ``'labels'``.

    The AML files are selected by their position in the file list. Because
    ``glob.glob`` does not guarantee any ordering, the list is sorted so
    that the position-to-patient mapping is reproducible.
    NOTE(review): the mapping is assumed to have been written against the
    lexicographic file order -- confirm against the actual file names.

    Returns:
        int: always 0.

    Raises:
        IndexError: if the AML directory holds fewer files than the
        mapping refers to.
    """
    # stimulation conditions in this experiment
    STIM = [
        'Basal1', 'Basal2', 'AICAR', 'Flt3L', 'G-CSF', 'GM-CSF', 'IFNa',
        'IFNg', 'IL-10', 'IL-27', 'IL-3', 'IL-6', 'PMAiono', 'PVO4', 'SCF',
        'TNFa', 'TPO',
    ]
    full_stim_names = (['_'.join(['NoDrug', stim]) for stim in STIM]
                       + ['BEZ-235_Basal1'])

    # all available channels in this experiment
    channels = [
        'Time', 'Cell_length', 'DNA1', 'DNA2', 'BC1', 'BC2', 'BC3', 'BC4',
        'BC5', 'BC6', 'pPLCg2', 'CD19', 'p4EBP1', 'CD11b', 'pAMPK',
        'pSTAT3', 'CD34', 'pSTAT5', 'pS6', 'pCREB', 'pc-Cbl', 'CD45',
        'CD123', 'pSTAT1', 'pZap70-Syk', 'CD33', 'CD47', 'pAKT', 'CD7',
        'CD15', 'pRb', 'CD44', 'CD38', 'pErk1-2', 'CD3', 'pP38', 'CD117',
        'cCaspase3', 'HLA-DR', 'CD64', 'CD41', 'Viability', 'PhenoGraph',
    ]

    # which markers should be kept for further analysis
    labels = [
        'CD19', 'CD11b', 'CD34', 'CD45', 'CD123', 'CD33', 'CD47', 'CD7',
        'CD15', 'CD44', 'CD38', 'CD3', 'CD117', 'HLA-DR', 'CD64', 'CD41',
    ]

    # which columns correspond to the interesting markers
    marker_idx = [channels.index(label) for label in labels]

    # data directory
    FCS_DATA_PATH = '/Volumes/biol_imsb_claassen_s1/eiriniar/Data/phenograph_data'

    # read the data from healthy samples
    healthy_keys = ['H' + str(i) for i in range(1, 6)]
    D = read_healthy_data(FCS_DATA_PATH, healthy_keys, full_stim_names,
                          marker_idx)
    aml_dict = {
        'healthy_BM': [(key, D[key]) for key in ['H1', 'H2', 'H3', 'H5', 'H4']]
    }

    # map .txt files (by position in the sorted file list) back to patient
    # identifiers; only patients with sufficiently high blast counts
    # (>10% of total cell counts) are listed
    mapping = {
        0: 'SJ10', 2: 'SJ12', 3: 'SJ13', 4: 'SJ14', 5: 'SJ15', 6: 'SJ16',
        8: 'SJ1', 9: 'SJ1', 10: 'SJ2', 11: 'SJ2', 12: 'SJ3', 13: 'SJ3',
        14: 'SJ4', 15: 'SJ5', 17: 'SJ7',
    }

    # read the data from AML samples
    # gated blast populations were downloaded as .txt files from Cytobank
    # CAREFUL when reading these .txt files:
    # they are tab-separated and include an extra first column (cell index)
    # glob order is filesystem dependent, so sort for a stable index
    AML_files = sorted(
        glob.glob(os.path.join(FCS_DATA_PATH, 'AML_blasts', '*.txt')))
    if len(AML_files) <= max(mapping):
        raise IndexError(
            'expected at least %d AML blast files, found %d'
            % (max(mapping) + 1, len(AML_files)))

    for sj in sorted(mapping):
        fname = AML_files[sj]
        t = pd.read_csv(fname, skiprows=1, sep='\t', index_col=0)
        # show which columns the marker positions correspond to in this file
        columns = list(t.columns)
        print([columns[ii] for ii in marker_idx])
        data_blasts = ftrans(np.asarray(t)[:, marker_idx], 5)
        # patients with two files: only the first one encountered is kept
        if mapping[sj] not in aml_dict:
            aml_dict[mapping[sj]] = data_blasts

    # save the pre-processed dataset
    pickle_dir = os.path.join(cellCnn.__path__[0], 'examples', 'data')
    mkdir_p(pickle_dir)
    pickle_file = os.path.join(pickle_dir, 'AML.pkl')
    aml_dict['labels'] = labels
    with open(pickle_file, 'wb') as f:
        pickle.dump(aml_dict, f, -1)
    return 0
6:'SJ16', 8:'SJ1', 9:'SJ1', 10:'SJ2', 11:'SJ2', 12:'SJ3', 13:'SJ3', 14:'SJ4', 15:'SJ5', 17:'SJ7'} # read the data from AML samples # gated blast populations were downloaded as .txt files from Cytobank # CAREFUL when reading these .txt files: # they are tab-separated and include an extra first column (cell index) AML_files = glob.glob(os.path.join(FCS_DATA_PATH, 'AML_blasts', '*.txt')) # only include patients with sufficiently high blast counts # (>10% of total cell counts) for sj in [0,2,3,4,5,6,8,9,10,11,12,13,14,15,17]: fname = AML_files[sj] t = pd.read_csv(fname, skiprows=1, sep='\t', index_col=0) print [list(t.columns)[ii] for ii in marker_idx] data_blasts = ftrans(np.asarray(t)[:, marker_idx], 5) if mapping[sj] not in aml_dict: aml_dict[mapping[sj]] = data_blasts # save the pre-processed dataset pickle_dir = os.path.join(cellCnn.__path__[0], 'examples', 'data') mkdir_p(pickle_dir) pickle_file = os.path.join(pickle_dir, 'AML.pkl') aml_dict['labels'] = labels with open(pickle_file, 'wb') as f: pickle.dump(aml_dict, f, -1) return 0