Beispiel #1
0
def main():
    """Pre-process the ALL/MRD mass-cytometry tables and pickle the result.

    Three CSV tables (MRD, healthy, control) are read from ``PATH``, reduced
    to the marker columns named in ``full_labels`` and passed through
    ``ftrans(..., 5)`` (the arcsinh transform).  Only the 500 MRD rows with
    the largest CD10 value are kept.  The resulting arrays, together with the
    short marker names, are written to ``ALL.pkl`` in the ``examples/data``
    directory of the installed ``cellCnn`` package.

    Returns:
        int: always 0.
    """
    PATH = "/Volumes/biol_imsb_claassen_s1/eiriniar/Data/viSNE/mrd_debarcode"

    # read the three input tables (same order as before: MRD, healthy, control)
    mrd_data, healthy_data, control_data = [
        pd.read_csv(os.path.join(PATH, basename), sep=',')
        for basename in ('mrd_debarcoded.csv',
                         'healthy_debarcoded.csv',
                         'visne_marrow1.csv')
    ]

    # every channel present in the control table
    channels = list(control_data.columns)

    # markers retained for the downstream analysis
    full_labels = [
        'CD19(Nd142)Di', 'CD22(Nd143)Di', 'CD47(Nd145)Di', 'CD79b(Nd146)Di',
        'CD20(Sm147)Di', 'CD34(Nd148)Di', 'CD179a(Sm149)Di', 'CD72(Eu151)Di',
        'IgM-i(Eu153)Di', 'CD45(Sm154)Di', 'CD10(Gd156)Di',
        'CD179b(Gd158)Di', 'CD11c(Tb159)Di', 'CD14(Gd160)Di', 'CD24(Dy161)Di',
        'CD127(Dy162)Di', 'TdT(Dy163)Di', 'CD15(Dy164)Di', 'Pax5(Ho165)Di',
        'CD38(Er168)Di', 'CD3(Er170)Di', 'CD117(Yb171)Di',
        'CD49d(Yb172)Di', 'CD33(Yb173)Di', 'HLADR(Yb174)Di', 'IgM-s(Lu175)Di',
        'CD7(Yb176)Di',
    ]

    # short marker name = everything in front of the first '('
    labels = [name.partition('(')[0] for name in full_labels]

    # column positions of the retained markers, looked up in the control table
    # NOTE(review): the same positions are applied to the MRD and healthy
    # tables, which assumes all three files share one column order - confirm.
    marker_idx = [channels.index(name) for name in full_labels]

    def select_and_transform(table):
        # keep the marker columns only, then arcsinh-transform them
        return ftrans(np.asarray(table)[:, marker_idx], 5)

    x_mrd = select_and_transform(mrd_data)
    x_healthy = select_and_transform(healthy_data)
    x_control = select_and_transform(control_data)

    # CD10+ blasts: the 500 rows with the highest CD10 value
    cd10_col = labels.index('CD10')
    x_mrd = x_mrd[np.argsort(x_mrd[:, cd10_col])[-500:]]

    # store the pre-processed dataset next to the cellCnn examples
    pickle_dir = os.path.join(cellCnn.__path__[0], 'examples', 'data')
    mkdir_p(pickle_dir)
    pickle_file = os.path.join(pickle_dir, 'ALL.pkl')

    data_dict = {
        'control': x_control,
        'healthy': x_healthy,
        'ALL': x_mrd,
        'labels': labels,
    }
    with open(pickle_file, 'wb') as handle:
        pickle.dump(data_dict, handle, pickle.HIGHEST_PROTOCOL)

    return 0
Beispiel #2
0
def load_fcs_dataset(fcs_info_file, marker_file, co_factor=5):
    """Load and transform every FCS sample listed in an info file.

    Args:
        - fcs_info_file (str) :
          Path to a comma-separated info file whose first column holds the
          fcs file name and whose second column holds the integer label.
          The fcs files are looked up in the directory of this info file.
        - marker_file (str) :
          Path to a comma-separated file whose header row names the markers
          to keep.
        - co_factor (float) :
          Cofactor handed to `ftrans` for the arcsinh transform.

    Returns:
        - samples (list) :
          One transformed array per fcs file, restricted to the requested
          markers, in info-file order.
        - phenotypes (list) :
          The integer label of each sample, in the same order.
    """
    info_table = np.array(pd.read_csv(fcs_info_file, sep=','))
    marker_names = list(pd.read_csv(marker_file, sep=',').columns)
    file_names = info_table[:, 0]
    file_labels = info_table[:, 1].astype(int)
    base_dir = os.path.dirname(fcs_info_file)

    samples = []
    for file_name in file_names:
        events = loadFCS(os.path.join(base_dir, file_name),
                         transform=None, auto_comp=False)
        # channel positions are resolved per file, by marker name
        columns = [events.channels.index(marker) for marker in marker_names]
        samples.append(ftrans(np.asarray(events)[:, columns], co_factor))

    phenotypes = list(file_labels)
    return samples, phenotypes
Beispiel #3
0
def read_healthy_data(basedir, keys, stimuli, marker_idx):
    """Load, transform and merge the per-stimulus files of each sample.

    For every ``key`` the files
    ``<basedir>/<key>/<key>_<stim>_PhenoGraph.csv`` (one per entry of
    ``stimuli``) are read with ``fcm.loadFCS``, restricted to the columns in
    ``marker_idx``, arcsinh-transformed with ``ftrans(..., 5)`` and stacked
    row-wise.  A file that cannot be processed is reported on stdout and
    skipped.

    Args:
        basedir (str): directory that holds one sub-directory per key.
        keys (iterable of str): sample identifiers (sub-directory names).
        stimuli (iterable of str): condition names used in the file names.
        marker_idx (list of int): column indices of the markers to keep.

    Returns:
        dict: maps each key to the row-wise stack of its transformed files.

    Raises:
        ValueError: if not a single file of some key could be loaded.
    """
    lookup = dict()

    for key in keys:
        subdir = os.path.join(basedir, key)
        data_list = []

        for stim in stimuli:
            fname = os.path.join(subdir,
                                 '_'.join([key, stim, 'PhenoGraph.csv']))
            try:
                # load the raw data
                x = fcm.loadFCS(fname, transform=None)
                # single-argument print(...) behaves the same on Python 2 and 3
                print([x.channels[ii] for ii in marker_idx])

                # select interesting markers and arcsinh-transform
                x = ftrans(np.asarray(x)[:, marker_idx], 5)

                # merge data from different stimulation conditions
                data_list.append(x)

            except Exception:
                # best effort: report the file and continue with the others
                print('Problem loading: ' + fname)

        if not data_list:
            # np.vstack([]) would fail anyway; name the offending sample
            raise ValueError('No data could be loaded for sample: ' + key)
        lookup[key] = np.vstack(data_list)
    return lookup
Beispiel #4
0
def read_healthy_data(basedir, keys, stimuli, marker_idx):
    """Load, transform and merge the per-stimulus files of each sample.

    For every ``key`` the files
    ``<basedir>/<key>/<key>_<stim>_PhenoGraph.csv`` (one per entry of
    ``stimuli``) are read with ``fcm.loadFCS``, restricted to the columns in
    ``marker_idx``, arcsinh-transformed with ``ftrans(..., 5)`` and stacked
    row-wise.  A file that cannot be processed is reported on stdout and
    skipped.

    Args:
        basedir (str): directory that holds one sub-directory per key.
        keys (iterable of str): sample identifiers (sub-directory names).
        stimuli (iterable of str): condition names used in the file names.
        marker_idx (list of int): column indices of the markers to keep.

    Returns:
        dict: maps each key to the row-wise stack of its transformed files.

    Raises:
        ValueError: if not a single file of some key could be loaded.
    """
    lookup = dict()

    for key in keys:
        subdir = os.path.join(basedir, key)
        data_list = []

        for stim in stimuli:
            fname = os.path.join(subdir,
                                 '_'.join([key, stim, 'PhenoGraph.csv']))
            try:
                # load the raw data
                x = fcm.loadFCS(fname, transform=None)
                # single-argument print(...) behaves the same on Python 2 and 3
                print([x.channels[ii] for ii in marker_idx])

                # select interesting markers and arcsinh-transform
                x = ftrans(np.asarray(x)[:, marker_idx], 5)

                # merge data from different stimulation conditions
                data_list.append(x)

            except Exception:
                # best effort: report the file and continue with the others
                print('Problem loading: ' + fname)

        if not data_list:
            raise ValueError('No data could be loaded for sample: ' + key)
        # store the merged matrix; otherwise the collected data is discarded
        lookup[key] = np.vstack(data_list)
    return lookup
Beispiel #5
0
def no_inhibitor_lookup_full(data_path, stimuli, ctypes, marker_idx):
    """Collect transformed, mean-shifted data for every sample directory.

    For each immediate sub-directory ``key`` of ``data_path`` and each
    combination of cell type and stimulus, the file
    ``<key>_<ctype>_H<stim>.fcs`` is read with ``fcm.loadFCS``, restricted to
    the columns in ``marker_idx`` and arcsinh-transformed with
    ``ftrans(..., 5)``.  Columns from index 10 onward (the intracellular
    markers, according to the original comments) are then shifted by a
    ``StandardScaler(with_std=False)`` fitted on the ``'cd4+'`` / ``'05'``
    file of the same sub-directory.

    NOTE: the scaler is fitted when the ``'cd4+'`` / ``'05'`` file is reached
    in iteration order.  Files processed before it fail in
    ``scaler.transform`` and are skipped like any other unreadable file, so
    that combination should come first in ``ctypes`` / ``stimuli``.

    Args:
        data_path (str): directory holding one sub-directory per sample.
        stimuli (sequence of str): stimulus codes used in the file names; the
            position of a code in this sequence becomes its label in ``'y'``.
        ctypes (sequence of str): cell-type names used in the file names.
        marker_idx (list of int): column indices of the markers to keep.

    Returns:
        dict: maps each sub-directory name to a dict with the keys
        ``'X'`` (stacked data), ``'y'`` (stimulus index per row),
        ``'ctype'`` (cell type per row, flattened with ``flat_list``),
        ``'labels'`` (channel names of ``marker_idx``, taken from the first
        file that loads) and ``'scaler'`` (the scaler of that sub-directory).

    Raises:
        ValueError: if no file of some sub-directory could be processed.
    """
    lookup = dict()
    dose = 'H'
    labels = None
    # single-argument print(...) behaves the same on Python 2 and 3
    print(stimuli)

    for key in get_immediate_subdirectories(data_path):
        subdir = os.path.join(data_path, key)
        full_data_list = []
        stim_list, ctype_list = [], []

        scaler = StandardScaler(with_std=False)
        for ctype in ctypes:
            for jj, stim in enumerate(stimuli):

                tu = (key, ctype, dose, stim)
                fname = os.path.join(subdir, '{0}_{1}_{2}{3}.fcs'.format(*tu))
                try:

                    # read the .fcs file
                    x_full = fcm.loadFCS(fname, transform=None)
                    if labels is None:
                        labels = [x_full.channels[idx] for idx in marker_idx]

                    # keep only interesting markers and arcsinh-transform
                    x_full = ftrans(np.asarray(x_full)[:, marker_idx], 5)

                    # fit a mean-shift scaler on control CD4+ T-cells
                    # (only on intracellular markers)
                    if (ctype == 'cd4+') and (stim == '05'):
                        scaler.fit(x_full[:, 10:])

                    # and transform all cell types using this scaler
                    x_full[:, 10:] = scaler.transform(x_full[:, 10:])

                    # accumulate the data seen so far along with their labels
                    full_data_list.append(x_full)
                    stim_list.append(jj * np.ones(x_full.shape[0], dtype=int))
                    ctype_list.append([ctype] * x_full.shape[0])

                except Exception as err:
                    # still best effort, but no longer silent: the reason
                    # distinguishes a missing file from an unfitted scaler
                    print('Problem loading: ' + fname + ' (' + str(err) + ')')

        if not full_data_list:
            # np.vstack([]) would fail anyway; name the offending directory
            raise ValueError('No data could be loaded for sample: ' + key)

        lookup[key] = {'X': np.vstack(full_data_list),
                       'y': np.hstack(stim_list),
                       'ctype': flat_list(ctype_list),
                       'labels': labels,
                       'scaler': scaler}
    return lookup
Beispiel #6
0
def load_data():
    """Load the .fcs samples listed in the clinical info table.

    The tab-separated table ``clinical_data_flow_repository.csv`` in
    ``BASEDIR`` is expected to have three columns per row, unpacked as
    ``ID, y1, y2``.  Rows with ``y1 <= 0`` are ignored.  For the remaining
    rows the file ``<prefix>_<ID>_<suffix>`` in ``PATH_COMP`` is read with
    ``fcm.loadFCS``, restricted to the module-level ``marker_idx`` columns
    and arcsinh-transformed with ``ftrans(..., 150)``.  Only samples with
    more than 3000 rows are kept; files that fail to load are reported on
    stdout and skipped.

    Returns:
        tuple: ``(data_list, name_list, y)`` where ``data_list`` holds one
        transformed array per kept sample, ``name_list`` the matching file
        paths and ``y`` an array of shape ``(n_kept, 3)`` whose columns are
        ``y1`` (survival time, per the original comment), ``y2``
        (presumably the event status - confirm) and the row position of the
        sample in the info table.  ``y`` has shape ``(0, 3)`` when no sample
        was kept.
    """
    infofile = os.path.join(BASEDIR, 'clinical_data_flow_repository.csv')
    df = pd.read_csv(infofile, sep='\t')
    data_list = []
    name_list = []
    ytime_l, ystatus_l, id_l = [], [], []

    # iterate by position, which is what df.iloc expects
    for ii in range(len(df)):
        ID, y1, y2 = df.iloc[ii]

        # analyze only samples with positive survival times
        if not (y1 > 0):
            continue

        file1 = os.path.join(PATH_COMP, '_'.join([prefix, str(ID), suffix]))

        try:
            # load the raw .fcs data
            X = fcm.loadFCS(file1, transform=None)

            # keep only interesting markers and arcsinh-transform the data
            X = ftrans(np.asarray(X)[:, marker_idx], 150)

            # keep only samples with more than 3000 cells
            if X.shape[0] > 3000:
                # single-argument print(...) is the same on Python 2 and 3
                print(X.shape)
                data_list.append(X)
                name_list.append(file1)
                ytime_l.append(y1)
                ystatus_l.append(y2)
                id_l.append(ii)

        except Exception:
            # best effort: report the sample and continue with the others
            print('Could not find or load sample: ' + file1)

    # one row per kept sample; unlike hstack this also works when no sample
    # was kept
    y = np.column_stack((ytime_l, ystatus_l, id_l))
    return data_list, name_list, y
Beispiel #7
0
def main():
    """Pre-process the AML dataset and pickle it for the cellCnn examples.

    Healthy samples H1-H5 are loaded through ``read_healthy_data`` for all
    stimulation conditions.  Gated AML blast populations are read from the
    tab-separated ``.txt`` files in ``AML_blasts``, restricted to the
    surface markers in ``labels`` and arcsinh-transformed with
    ``ftrans(..., 5)``.  Everything is written to ``AML.pkl`` in the
    ``examples/data`` directory of the installed ``cellCnn`` package.

    Returns:
        int: always 0.
    """

    # stimulation conditions in this experiment
    STIM = [
        'Basal1', 'Basal2', 'AICAR', 'Flt3L', 'G-CSF', 'GM-CSF', 'IFNa',
        'IFNg', 'IL-10', 'IL-27', 'IL-3', 'IL-6', 'PMAiono', 'PVO4', 'SCF',
        'TNFa', 'TPO'
    ]
    full_stim_names = ['_'.join(['NoDrug', stim])
                       for stim in STIM] + ['BEZ-235_Basal1']

    # all available channels in this experiment
    channels = [
        'Time', 'Cell_length', 'DNA1', 'DNA2', 'BC1', 'BC2', 'BC3', 'BC4',
        'BC5', 'BC6', 'pPLCg2', 'CD19', 'p4EBP1', 'CD11b', 'pAMPK', 'pSTAT3',
        'CD34', 'pSTAT5', 'pS6', 'pCREB', 'pc-Cbl', 'CD45', 'CD123', 'pSTAT1',
        'pZap70-Syk', 'CD33', 'CD47', 'pAKT', 'CD7', 'CD15', 'pRb', 'CD44',
        'CD38', 'pErk1-2', 'CD3', 'pP38', 'CD117', 'cCaspase3', 'HLA-DR',
        'CD64', 'CD41', 'Viability', 'PhenoGraph'
    ]

    # which markers should be kept for further analysis
    labels = [
        'CD19', 'CD11b', 'CD34', 'CD45', 'CD123', 'CD33', 'CD47', 'CD7',
        'CD15', 'CD44', 'CD38', 'CD3', 'CD117', 'HLA-DR', 'CD64', 'CD41'
    ]

    # which columns correspond to the interesting markers
    marker_idx = [channels.index(label) for label in labels]

    # data directory
    FCS_DATA_PATH = '/Volumes/biol_imsb_claassen_s1/eiriniar/Data/phenograph_data'

    # read the data from healthy samples
    healthy_keys = ['H' + str(i) for i in range(1, 6)]
    D = read_healthy_data(FCS_DATA_PATH, healthy_keys, full_stim_names,
                          marker_idx)
    aml_dict = {
        'healthy_BM': [(key, D[key]) for key in ['H1', 'H2', 'H3', 'H5', 'H4']]
    }

    # map .txt files (by position in the sorted file list) back to patient
    # identifiers; only patients with sufficiently high blast counts
    # (>10% of total cell counts) are listed
    mapping = {
        0: 'SJ10',
        2: 'SJ12',
        3: 'SJ13',
        4: 'SJ14',
        5: 'SJ15',
        6: 'SJ16',
        8: 'SJ1',
        9: 'SJ1',
        10: 'SJ2',
        11: 'SJ2',
        12: 'SJ3',
        13: 'SJ3',
        14: 'SJ4',
        15: 'SJ5',
        17: 'SJ7'
    }

    # read the data from AML samples
    # gated blast populations were downloaded as .txt files from Cytobank
    # CAREFUL when reading these .txt files:
    # they are tab-separated and include an extra first column (cell index)
    #
    # glob.glob returns matches in arbitrary, platform-dependent order, but
    # the mapping above addresses files by position, so the list is sorted
    # to make the positions reproducible.
    # NOTE(review): the indices look like lexicographic file-name order -
    # confirm against the actual file names.
    AML_files = sorted(
        glob.glob(os.path.join(FCS_DATA_PATH, 'AML_blasts', '*.txt')))

    for sj in sorted(mapping):
        fname = AML_files[sj]
        t = pd.read_csv(fname, skiprows=1, sep='\t', index_col=0)
        columns = list(t.columns)
        # single-argument print(...) behaves the same on Python 2 and 3
        print([columns[ii] for ii in marker_idx])
        data_blasts = ftrans(np.asarray(t)[:, marker_idx], 5)
        # a patient with several files keeps only the first one encountered
        if mapping[sj] not in aml_dict:
            aml_dict[mapping[sj]] = data_blasts

    # save the pre-processed dataset
    pickle_dir = os.path.join(cellCnn.__path__[0], 'examples', 'data')
    mkdir_p(pickle_dir)
    pickle_file = os.path.join(pickle_dir, 'AML.pkl')
    aml_dict['labels'] = labels
    with open(pickle_file, 'wb') as f:
        pickle.dump(aml_dict, f, -1)

    return 0
Beispiel #8
0
				6:'SJ16', 8:'SJ1', 9:'SJ1', 10:'SJ2', 11:'SJ2',
				12:'SJ3', 13:'SJ3', 14:'SJ4', 15:'SJ5', 17:'SJ7'}
	
	# read the data from AML samples
	# gated blast populations were downloaded as .txt files from Cytobank
	# CAREFUL when reading these .txt files:
	# they are tab-separated and include an extra first column (cell index)
	AML_files = glob.glob(os.path.join(FCS_DATA_PATH, 'AML_blasts', '*.txt'))
   
	# only include patients with sufficiently high blast counts 
	# (>10% of total cell counts)
	for sj in [0,2,3,4,5,6,8,9,10,11,12,13,14,15,17]:
		fname = AML_files[sj]
		t = pd.read_csv(fname, skiprows=1, sep='\t', index_col=0)
		print [list(t.columns)[ii] for ii in marker_idx]
		data_blasts = ftrans(np.asarray(t)[:, marker_idx], 5)
		if mapping[sj] not in aml_dict:
			aml_dict[mapping[sj]] = data_blasts

	
	# save the pre-processed dataset
	pickle_dir = os.path.join(cellCnn.__path__[0], 'examples', 'data')
	mkdir_p(pickle_dir)
	pickle_file = os.path.join(pickle_dir, 'AML.pkl')
	aml_dict['labels'] = labels
	with open(pickle_file, 'wb') as f:
		pickle.dump(aml_dict, f, -1)

	return 0