def pre_process(cFile, points=500, channels=('FITC-A', 'PE-Texas Red-A'), fcs=True):
    """Pre-process a dataset into a random sample of events.

    When ``fcs`` is true, ``cFile`` is loaded as an FCS measurement and the
    requested channels are written to a tab-delimited text file that shares
    the input's path but has a ``.txt`` extension. That text file (or, when
    ``fcs`` is false, the already existing file with that derived name) is
    then parsed and ``points`` rows are drawn at random without replacement.

    Parameters
    ----------
    cFile : str
        Path of the input file. Only its extension is replaced to derive the
        name of the text file that is read.
    points : int
        Number of rows to draw.
    channels : sequence of str
        Channel names to export; also fixes the expected column count.
    fcs : bool
        Whether ``cFile`` must first be converted from FCS to text.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(points, len(channels))``; columns follow the order
        of ``channels``.

    Raises
    ------
    ValueError
        If a requested channel is absent from the FCS file, or if ``points``
        exceeds the number of rows (raised by ``random.sample``).
    IOError
        If the text file is empty or its first row does not have exactly
        ``len(channels)`` tab-separated columns.
    """
    import os

    # splitext only strips the final extension, so dots elsewhere in the
    # path (e.g. "./data/run.1.fcs") no longer truncate the file name.
    text_name = os.path.splitext(cFile)[0] + ".txt"

    if fcs:
        sample = FCMeasurement(ID="sample", datafile=cFile)
        allowed = list(sample.meta['_channel_names_'])
        for chan in channels:
            if chan not in allowed:
                raise ValueError('%s is not a valid channel. Valid channels for %s include:\n%s'
                                 % (chan, cFile, str(allowed)))
        # A real list is required: column_stack does not accept generators.
        data_columns = [sample.data[meas].values for meas in channels]
        np.savetxt(text_name, np.column_stack(data_columns), delimiter='\t', fmt="%.2f")

    # Open file and check for conformation to format.
    with open(text_name, 'r') as f:
        fData = f.readlines()
    if not fData or len(fData[0].strip('\n').split('\t')) != len(channels):
        raise IOError('Input files must be %s-column, tab-delimited text files.'
                      % str(len(channels)))

    # Parse data files, generate random points.
    rows = [line.strip('\n').split('\t') for line in fData]
    signals = [np.array([float(row[n]) for row in rows]) for n in range(len(channels))]
    indices = random.sample(range(signals[0].size), points)

    # Stack the sampled columns in channel order: one row per drawn event.
    darray = np.vstack([column[indices] for column in signals]).T
    return darray
def get_PCA_FCS(temp_file_path):
    """Load an FCS file and run PCA on its compensated array.

    The file is loaded with an empty ID, passed through
    ``get_compensated_array`` and the result is handed to ``get_PCA``.

    Returns the two values produced by ``get_PCA`` as a tuple
    ``(pca_fracs, pca_Wt)``.
    """
    measurement = FCMeasurement(ID='', datafile=temp_file_path)
    fractions, weights = get_PCA(get_compensated_array(measurement))
    return fractions, weights
def _load(self):
    """Append one measurement per matching FCS file in ``self.datadir``.

    A directory entry matches when its name ends with ``.fcs`` and contains
    ``self.pattern``. Each matching name is printed, loaded and appended to
    ``self.samples``, which is returned.
    """
    for entry in os.listdir(self.datadir):
        if not entry.endswith(".fcs") or self.pattern not in entry:
            continue
        print(entry)
        measurement = FCMeasurement(ID=entry, datafile=self.datadir + "/" + entry)
        self.samples.append(measurement)
    return self.samples
def path_2_sample(path, id_name, transform=False):
    """Load the FCS file at ``path`` as a measurement named ``id_name``.

    When ``transform`` is truthy the measurement is passed through
    ``transform_sample`` before being returned; otherwise it is returned
    as loaded.
    """
    sample = FCMeasurement(ID=id_name, datafile=path)
    return transform_sample(sample) if transform else sample
def __init__(self, file_address, if_fcs=True, if_drop=True, drop_col=['Time']):
    """
    Read an FCS or CSV file into a pandas DataFrame stored on ``self.df``.

    Parameters
    ----------
    file_address : string
        Path of the input file, e.g. r'#40 Ab.fcs' or 'flowmetry.csv'.
    if_fcs : bool
        Whether the input file is an FCS file. If not, it is read as CSV.
        ``self.info`` (the loaded measurement) is only set for FCS input.
    if_drop : bool
        Whether the columns named in ``drop_col`` should be dropped.
    drop_col : list of strings
        Column names to drop when ``if_drop`` is true.
    """
    if not if_fcs:
        frame = pd.read_csv(file_address)
    else:
        self.info = FCMeasurement(ID='Train', datafile=file_address)
        frame = self.info.data

    # The unfiltered frame is stored first so it stays available even if
    # the drop below raises (e.g. for an unknown column name).
    self.df = frame
    if if_drop:
        self.df = frame.drop(drop_col, axis=1)
def predict(self, file_id):
    """Run every trained model on one FCS file and summarise the labels.

    The file ``file_id`` is read from ``SHARED_RAW_DIR``, pre-processed with
    ``self.pre_process``, reduced to the first ``len(PRINCIPAL_COMPONENTS)``
    columns and standardised with a freshly fitted ``StandardScaler``.

    ``self.response`` is updated in place and returned with:

    * ``'ts'`` - the microsecond field of the current time,
    * ``'file_name'`` - ``file_id``,
    * ``'predictions'`` - a dict holding ``'elements'`` (unique labels of the
      last model evaluated) and ``'counts'`` (label -> count per model name).
    """
    now = datetime.now()
    self.response['ts'] = now.microsecond
    self.response['file_name'] = file_id

    sample = FCMeasurement(ID='Test Sample',
                           datafile=os.path.join(SHARED_RAW_DIR, file_id))
    frame = self.pre_process(sample.data)
    features = frame.iloc[:, 0:len(PRINCIPAL_COMPONENTS)].values

    # Scale the data - feature scaling
    features = StandardScaler().fit_transform(features)

    predictions = {}
    counts = {}
    model_names = ['Logistic Regression', 'Decision Tree', 'Random Forest']
    for position, model in enumerate(self.models_array):
        labels, label_counts = np.unique(model.predict(features), return_counts=True)
        labels = labels.tolist()
        label_counts = label_counts.tolist()
        # Overwritten on every pass: only the last model's labels survive.
        predictions['elements'] = labels
        counts[model_names[position]] = dict(zip(labels, label_counts))

    predictions['counts'] = counts
    self.response['predictions'] = predictions
    return self.response
def prep_fcs(file_path, mosaic_object):
    """
    Load an FCS file and reduce it to the gated FSC/SSC/FL1 events.

    Steps, in order: keep the three channels named by ``mosaic_object``,
    drop events whose FL1 value is not positive, optionally log-transform
    FL1, replace FSC and SSC by their empirical CDF values, and keep only
    events whose FSC and SSC values fall inside the inclusive bounds in
    ``fsc_filt`` and ``ssc_filt``.

    :param file_path: Path to FCS file
    :param mosaic_object: object providing the channel names ``fsc``,
        ``ssc`` and ``fl1``, the flag ``amplification`` and the two-element
        bounds ``fsc_filt`` and ``ssc_filt``
    :return: prepped FCS data, as a pandas dataframe
    """
    fsc = mosaic_object.fsc
    ssc = mosaic_object.ssc
    fl1 = mosaic_object.fl1

    # data import
    all_fcs = FCMeasurement(ID='A1l', datafile=file_path)
    data = all_fcs.data[[fsc, ssc, fl1]]

    # remove zero elements; copy explicitly so the column assignments below
    # act on an independent frame rather than on a slice of the sample data
    data = data.loc[data[fl1] > 0].copy()

    # toggle for linear and log data
    if mosaic_object.amplification:
        data[fl1] = data[fl1].apply(math.log)

    # run model: express both scatter channels as ECDF values
    fsc_ecdf = ECDF(data[fsc])
    data[fsc] = fsc_ecdf(data[fsc])
    ssc_ecdf = ECDF(data[ssc])
    data[ssc] = ssc_ecdf(data[ssc])

    fsc_low, fsc_high = mosaic_object.fsc_filt[0], mosaic_object.fsc_filt[1]
    sub_data = data.loc[(data[fsc] >= fsc_low) & (data[fsc] <= fsc_high)]

    ssc_low, ssc_high = mosaic_object.ssc_filt[0], mosaic_object.ssc_filt[1]
    sub_data = sub_data.loc[(sub_data[ssc] >= ssc_low) & (sub_data[ssc] <= ssc_high)]
    return sub_data
def conv_append(self, tube, caseNum, tubeNum):
    """Copy one tube's events into ``self.conv_dataset[caseNum]``.

    The FCS file ``tube`` is loaded and ``self.channel_length`` is set to the
    number of entries in the sample's ``channels`` minus one. For each of the
    ``self.cells_per_tube`` events starting at row ``self.buffer``, the first
    ``channel_length`` values are written to
    ``conv_dataset[caseNum][channel][tubeNum * cells_per_tube + event]``.
    """
    sample = FCMeasurement(ID='Test Sample', datafile=tube)
    events = sample.data.values
    self.channel_length = len(sample.channels) - 1

    # First destination column belonging to this tube.
    tube_offset = tubeNum * self.cells_per_tube
    for cell in range(self.cells_per_tube):
        for chan in range(self.channel_length):
            self.conv_dataset[caseNum][chan][tube_offset + cell] = \
                events[self.buffer + cell][chan]
def load_file(File_name):
    """Load an FCS file and return its scatter/fluorescence matrix and ID.

    Parameters
    ----------
    File_name : str
        Path of the FCS file.

    Returns
    -------
    current_data : numpy.ndarray
        Array built from the columns ``'FSC-H'``, ``'FL1-H'`` and ``'SSC-H'``
        (in that order) of the sample data.
    sample_id
        The value stored under ``'SAMPLE ID'`` in the sample metadata.

    Raises
    ------
    KeyError
        If one of the three channels or the ``'SAMPLE ID'`` key is missing.
    """
    sample = FCMeasurement(ID='Test Sample', datafile=File_name)
    # One matrix holding forward scatter, FL1 and side scatter; the separate
    # per-channel arrays formerly built here were never used.
    current_data = NP.array(sample.data[['FSC-H', 'FL1-H', 'SSC-H']])
    sample_id = sample.meta[u'SAMPLE ID']
    return current_data, sample_id
def compile_untreated(date, cellFrac):
    """Load every untreated FCS file for ``date`` and combine the wells.

    All ``*.fcs`` files found recursively under
    ``ckine/data/Flow_Data_Meyer/<date>/Untreated/`` (relative to
    ``path_here``) are loaded, and the list is handed to ``combineWells``
    together with ``cellFrac`` and ``date``; its result is returned.
    """
    pathname = join(path_here, "ckine/data/Flow_Data_Meyer/" + date + "/Untreated/")
    FCfiles = [
        FCMeasurement(ID="All Data", datafile=fcs_path)
        for fcs_path in Path(r"" + str(pathname)).glob("**/*.fcs")
    ]
    return combineWells(FCfiles, cellFrac, date)
def importFlow(directory):
    """Load every ``.fcs`` file directly inside ``directory``.

    Returns a list with one measurement per directory entry whose name ends
    in ``.fcs``, in the order reported by ``os.listdir``. Each measurement
    uses the file name as its ID.
    """
    return [
        FCMeasurement(ID=name, datafile=directory + '/' + name)
        for name in os.listdir(directory)
        if name[-4:] == '.fcs'
    ]
def compute_median_log(dirr):
    """Return a normalised 12x8 grid of per-file medians.

    Every file matching ``dirr + "*.fcs"`` is loaded in ``sorted_nicely``
    order, cleaned with ``clean_sample`` and reduced to its median. The
    medians are arranged as an 8x12 grid, flipped left-right and up-down,
    incremented by one, transposed and finally divided by the element at
    ``[0, 0]`` of the transposed grid.

    Parameters
    ----------
    dirr : str
        Path prefix of the FCS files; it is concatenated directly with
        ``"*.fcs"``, so a directory needs its trailing separator.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(12, 8)``.

    Raises
    ------
    ValueError
        From ``np.reshape`` when the number of files found is not 96.
    """
    files = sorted_nicely(glob(dirr + "*.fcs"))
    # Build a concrete list: map() yields an iterator on Python 3, which
    # np.reshape cannot turn into an 8x12 array.
    medians = [
        np.median(clean_sample(FCMeasurement(ID=f, datafile=f)))
        for f in files
    ]
    x = np.reshape(np.asarray(medians, dtype=float), [8, 12])
    x = np.fliplr(x)
    x = np.flipud(x)
    x += 1.
    x = x.T
    x /= x[0, 0]
    return x
def FCScheck(self, datafile):
    """Return the sample ID stored in an FCS file and the one in its name.

    Parameters
    ----------
    datafile : str
        Path of the FCS file, using ``/`` as separator.

    Returns
    -------
    tuple
        ``(sampleID, filesampleID)`` where ``sampleID`` comes from the
        ``"TUBE NAME"`` metadata entry and ``filesampleID`` is built from the
        third and fourth ``_``-separated fields of the file name.

    Raises
    ------
    KeyError
        If ``"TUBE NAME"``, ``"$FIL"`` or ``"$SRC"`` is missing from the
        metadata.

    Notes
    -----
    The compiled patterns and the panel normalisation below do not influence
    the return value; they are kept as they were. All patterns are raw
    strings so that ``\\s`` is not treated as an invalid string escape.
    """
    filematch = re.compile(
        r"PANEL[\s_][AB][\s_][A][A-Z][A-Z][A-Z][\s_][0-9]{1,5}",
        flags=re.X | re.I)
    rawfilename = datafile.split("/")[-1]
    filesampleID = "_".join(rawfilename.split("_")[2:4])

    # Patterns currently unused by this method.
    codeonly = re.compile(r"[A][A-Z][A-Z][A-Z]_[0-9]{1,4}")
    controlonly = re.compile(r"[B][6][N][C]_[0-9]{1,4}")
    omitcontrol = re.compile(
        r"(^STAINED)|(Control)|(FMO)|(^UNSTAINED)|(^Specimen)|(,3a,)",
        flags=re.X | re.I)
    compFileNameA = re.compile(r"[A-z_]*PANELA$", flags=re.X | re.I)
    compFileNameB = re.compile(r"[A-z_]*PANELB$", flags=re.X | re.I)
    compFiles = re.compile(r"Compensation", flags=re.X | re.I)
    # Verbose pattern: whitespace inside it is ignored because of re.X.
    FluoroMatch = re.compile(
        r"""Compensation(\s|-|_){1,3}Controls(\s|-|_){0,3}(
        (CD25(\s|-|_){0,2}PE(\s|-|_){0,2}CY7)|
        (BV421)|
        (BV510)|
        (BV786)|
        (FITC)|
        (PE)|
        (APC)|
        (CD8a(\s|-|_){0,2}PE(\s|-|_){0,2}CF594)|
        (CD11b(\s|-|_){0,2}PE(\s|-|_){0,2}CF594)|
        (CD11C(\s|-|_){0,2}PE(\s|-|_){0,2}CY7)|
        (CD62L(\s|-|_){0,2}APC(\s|-|_){0,2}CY7)|
        (MHCII(\s|-|_){0,2}APC(\s|-|_){0,2}CY7)
        )""",
        flags=re.X | re.I)
    fullIMPC = re.compile(r"IMPC[12][\s_-]")

    sample = FCMeasurement(ID="Tops", datafile=datafile)
    mdata = sample.meta.keys()

    sampleID = sample.meta["TUBE NAME"]
    # NOTE(review): lstrip removes any leading run of the characters
    # "(", "u" and "'", not the literal prefix - an ID that itself starts
    # with "u" would lose it. Confirm this is intended.
    sampleID = str(sampleID.strip().lstrip("(u'"))

    FullName = sample.meta["$FIL"]
    panel = sample.meta["$SRC"]
    # Join whitespace-separated words with "_" and ensure a trailing "_".
    if not panel.endswith("_"):
        panel = "_".join(panel.split()) + "_"
    else:
        panel = "_".join(panel.split())
    return sampleID, filesampleID
def test_hlog_on_fc_measurement(self):
    """Check the hlog transform (b=10) on the first 3 rows / 4 columns."""
    expected = np.array([
        [-8.22113965e+03, 1.20259949e+03, 1.01216449e-06, 5.21899170e+03],
        [-8.66184277e+03, 1.01013794e+03, 1.01216449e-06, 5.71275928e+03],
        [-8.79974414e+03, 1.52737976e+03, 1.01216449e-06, -4.95852930e+03]
    ])
    transformed = FCMeasurement(ID='test', datafile=test_path).transform(
        transform='hlog', b=10)
    observed = transformed.data.values[:3, :4]
    np.testing.assert_array_almost_equal(
        observed, expected, decimal=5,
        err_msg='the hlog transformation gives an incorrect result')
def train(self):
    """Load the training FCS file and build the models.

    Reads ``TRAIN_FCS_FILE`` from ``SHARED_RAW_DIR``, stores its data on
    ``self.train_df``, replaces it with the output of ``self.pre_process``,
    then calls ``compute_diagnosis`` and ``split_train_test`` before storing
    the result of ``self.models()`` on ``self.models_array``.
    """
    train_sample = FCMeasurement(
        ID='Test Sample',
        datafile=os.path.join(SHARED_RAW_DIR, TRAIN_FCS_FILE),
    )

    # Raw frame first, then its pre-processed replacement.
    self.train_df = train_sample.data
    self.train_df = self.pre_process(self.train_df)

    self.compute_diagnosis()
    self.split_train_test()
    self.models_array = self.models()
def load_fcs(self, filepath=None, parent=None):
    """Load an FCS file into ``self.sample``, asking for a path if needed.

    When ``filepath`` is None a file dialog is opened (parented to
    ``parent``, or to the figure canvas when that is None). If no path
    results, nothing is loaded. Otherwise the file is loaded under the ID
    ``'temp'``, a warning about untransformed data is printed and
    ``self._sample_loaded_event()`` is called.
    """
    ax = self.ax  # kept from the original; not used below
    if parent is None:
        parent = self.fig.canvas

    if filepath is None:
        from FlowCytometryTools.gui import dialogs
        filepath = dialogs.open_file_dialog('Select an FCS file to load',
                                            'FCS files (*.fcs)|*.fcs',
                                            parent=parent)

    # No explicit path and nothing returned by the dialog: do nothing.
    if filepath is None:
        return

    self.sample = FCMeasurement('temp', datafile=filepath)
    print('WARNING: Data is raw (not transformation).')
    self._sample_loaded_event()
def load_facs(file_name):
    """
    Load an FCS file and return its raw values and channel names.

    Parameters
    ----------
    file_name - path string to the FCS file

    Returns
    -------
    raw_data - the values of the sample's data table
    parameters - numpy array of the channel names, each encoded to ASCII
                 with non-ASCII characters dropped
    """
    sample = FCMeasurement(ID='Test Sample', datafile=file_name)
    encoded_names = [name.encode('ascii', 'ignore') for name in sample.channel_names]
    raw_data = sample.data.values
    return raw_data, np.array(encoded_names)
def exptdf(self, exptdate, **kwargs):
    """
    Return a dataframe holding all flow observations found according to
    the master index, or None when no listed file exists.

    Pass ``master_index_df`` as a keyword argument to supply the master
    index directly; otherwise it is obtained from
    ``self.master_idx_by_date(exptdate)``. For every index row whose
    ``filepath`` exists, the FCS data is loaded and every field of the row
    is added to it as a constant column before all frames are concatenated.
    """
    try:
        master_idx = kwargs['master_index_df']
    except KeyError:
        master_idx = self.master_idx_by_date(exptdate)

    loaded = []
    # Read in data and add identifying information based on master index
    print(f'Found master index with {len(master_idx)} samples at')
    for label in master_idx.index:
        row = master_idx.loc[label, :]
        print(f'Looking for data at {row.filepath}')
        if not os.path.exists(row.filepath):
            print('No data found')
            continue

        print('Found data')
        frame = FCMeasurement(ID=f'{row.strain}-{row.clone}',
                              datafile=row.filepath).data
        print(f'Found {len(frame)} measurements in this file')
        # Annotate sample df
        for col in row.index:
            frame.loc[:, col] = row.loc[col]
        loaded.append(frame)

    if len(loaded) > 0:
        return pd.concat(loaded, ignore_index=True)

    print(f'No data found for exptdate {exptdate}')
    return None
def __read_fcs_file_to_fcm(self, fcs_file_name):
    """Load an FCS file from ``SHARED_RAW_DIR`` into ``self.sample``.

    If the requested file does not exist, a message is printed and
    ``fcs_file.fcs`` in the same directory is loaded instead. When
    ``self.transformation`` is set it is applied with ``b=self.bins``.

    If neither ``channel_name1`` nor ``channel_name2`` is set, they default
    to the first two channel names of the sample and ``self.channel_names``
    keeps the sample's full list; otherwise ``self.channel_names`` becomes
    the two configured names.
    """
    fcs_file = os.path.join(SHARED_RAW_DIR, fcs_file_name)
    if not os.path.exists(fcs_file):
        print('FCS file does not exist ', fcs_file)
        # Fall back to the default file name (running from cli).
        fcs_file = os.path.join(SHARED_RAW_DIR, 'fcs_file.fcs')

    # Load data
    tsample = FCMeasurement(ID='Test Sample', datafile=fcs_file)
    if self.transformation:
        tsample = tsample.transform(self.transformation, b=self.bins)

    self.channel_names = tsample.channel_names
    if self.channel_name1 or self.channel_name2:
        self.channel_names = [self.channel_name1, self.channel_name2]
    else:
        print('Check if channel names False', self.channel_names)
        self.channel_name1 = self.channel_names[0]
        self.channel_name2 = self.channel_names[1]

    self.sample = tsample
def importF2(pathname, WellRow):
    """
    Import the FCS files of one well row.

    ``pathname`` is searched recursively for ``*.fcs`` files. A file is kept
    when the first character of the second ``_``-separated field of its name
    equals ``WellRow``. The kept paths are sorted and each is loaded with the
    ID ``"Test Sample<n>"`` (n counting from 0).

    Returns ``(sample, file)``: the list of loaded measurements and the
    matching sorted list of path strings. An ``AssertionError`` is raised
    when no file matches.
    """
    file = sorted(
        str(candidate)
        for candidate in Path(r"" + str(pathname)).glob("**/*.fcs")
        if candidate.name.split("_")[1][0] == WellRow
    )
    assert file != []

    # One measurement per file, numbered in sorted order.
    sample = [
        FCMeasurement(ID="Test Sample" + str(number), datafile=entry)
        for number, entry in enumerate(file)
    ]
    return sample, file
def graph_cyto(gate, wells, output_file):
    """Plot one horizontal 'GFP-H' histogram per well and save them as a PDF.

    For each name in ``wells`` the file
    ``<directory>/export_<well>_Single Cells.fcs`` is loaded and drawn into
    the next slot of a 1x16 subplot row, using 100 log-spaced bins between
    10 and 1,000,000, a log y-axis limited to 10..500000 and a dashed line
    at 200. The figure is saved as ``<output_file>.pdf`` in the hard-coded
    output directory.

    ``gate`` is indexed in step with ``wells``; see the review note below.
    """
    directory = r'/Users/anazuniga/Documents/phyton/tc/'
    plt.subplots(figsize=(8, 2))
    plt.subplots_adjust(hspace=0.0, wspace=0.0)

    for slot, well in enumerate(wells, start=1):
        # NOTE(review): a colour is chosen from the gate value but the
        # histogram below always uses '#00a651' - confirm whether `c` was
        # meant to be passed as the histogram colour.
        c = '#00a651' if gate[slot - 1] == '1' else '#6D6E71'

        datafile = directory + '/export_' + well + '_Single Cells.fcs'
        sample = FCMeasurement(ID='Test Sample', datafile=datafile)
        logbins = np.geomspace(10, 1000000, 100)

        ax = plt.subplot(1, 16, slot)
        ax.hist(sample['GFP-H'], bins=logbins, orientation="horizontal",
                color='#00a651')
        plt.yscale('log')
        ax.set_ylim(10, 500000)
        ax.xaxis.set_ticks_position('none')
        ax.set_xticks([])
        ax.axhline(200, color="black", linestyle='--', dashes=(4, 4), lw=1)

    plt.savefig("/Users/anazuniga/Documents/phyton/tc/" + output_file + ".pdf",
                format="pdf")
def analyze(datafile):
    """Show a grid of 100-bin histograms, one per channel of an FCS file.

    The ``'Time'`` channel is skipped. For every other channel the minimum
    and maximum are printed. Channels whose name starts with ``FSC`` or
    ``SSC`` are histogrammed as-is; all others have non-positive values
    replaced by 1 and are log10-transformed first. ``'FSC-W'`` is restricted
    to values below 1.5e5. Bar heights are the bin counts divided by 1000.

    Parameters
    ----------
    datafile : str
        Path of the FCS file; also used as the measurement ID.
    """
    sample = FCMeasurement(ID=datafile, datafile=datafile)
    hists = []
    channels = []
    bin_edges = []
    for channel, data in zip(sample.channel_names, sample.data.values.T):
        if channel == 'Time':
            continue
        print("{}: Min: {}, Max: {}".format(channel, np.min(data), np.max(data)))

        # Convert scale: scatter channels stay linear, the rest go to log10.
        if not channel.startswith(('FSC', 'SSC')):
            # Work on a copy so the sample's own value buffer is not edited
            # (it may be shared or read-only).
            data = data.copy()
            data[data <= 0] = 1  # For convenience until biexponential is implemented
            data = np.log10(data)

        # modify range
        if channel == 'FSC-W':
            data = data[data < 1.5e5]

        hist, bin_edge = np.histogram(data.flatten(), bins=100)
        hists.append(hist)
        channels.append(channel)
        bin_edges.append(bin_edge)

    # Four plots per row; round the row count up so every plotted channel
    # gets a slot and the value passed to add_subplot is an integer.
    n_cols = 4
    n_rows = max(1, (len(channels) + n_cols - 1) // n_cols)

    fig = plt.figure()
    for i, (channel, hist, bin_edge) in enumerate(zip(channels, hists, bin_edges), start=1):
        ax = fig.add_subplot(n_rows, n_cols, i)
        ax.bar(bin_edge[:-1], hist / 1000, width=bin_edge[1:] - bin_edge[:-1])
        ax.ticklabel_format(axis='x', style='sci', scilimits=(1, 4))
        ax.set_title(channel)
    plt.show()
def rename_fcs(fcs_filelist, sample, md, datadir, gatingdir, tmpdir, logfile):
    """Rename one sample's FCS file (and gate file) and collect its metadata.

    ``find_files`` locates the data and gate files for the sample's plate
    and well. The FCS header is read (without event data) to obtain
    ``$DATE``, which becomes part of the new file names. The data file is
    moved from ``tmpdir`` to ``datadir`` and the gate file, on a best-effort
    basis, from ``tmpdir`` to ``gatingdir``.

    Parameters
    ----------
    fcs_filelist, logfile
        Passed through to ``find_files``.
    sample
        Object providing ``well``, ``plate``, ``sample_id`` and ``expt_id``.
    md
        Metadata accumulator; passed to and replaced by ``get_metadata``.
    datadir, gatingdir : str
        Destination directories for the data and gate files.
    tmpdir : str
        Source prefix; it is concatenated directly with the file names.

    Returns
    -------
    The updated ``md``, or ``md`` unchanged when no data file was found.

    Raises
    ------
    OSError
        If the data file cannot be renamed.
    """
    well = sample.well
    plate_num = sample.plate
    result = find_files(fcs_filelist, plate_num, well, logfile)
    datafile = result['datafile']
    gatefile = result['gatefile']

    # TODO: check if datafile has > 1 arg.
    if not datafile:
        # error should have already been recorded in the 'find_files' function
        print("datafile is empty")
        return md

    # Read in the FCS header only, to get the metadata.
    fcs_data = FCMeasurement(ID=well, datafile=tmpdir + datafile, readdata=False)
    fcs_date = pd.to_datetime(fcs_data.meta['$DATE']).strftime("%Y-%m-%d")

    # Create filenames
    stem = f'{sample.sample_id}-plate{plate_num}-{well}_sysserology-{sample.expt_id}_{fcs_date}'
    fcs_filename = f'{stem}.fcs'
    gate_filename = f'{stem}_gates.xml'

    md = get_metadata(sample, fcs_data, fcs_date, fcs_filename, md)

    # Rename the data files
    os.rename(tmpdir + datafile, f'{datadir}/{fcs_filename}')
    try:
        os.rename(tmpdir + gatefile, f'{gatingdir}/{gate_filename}')
    except Exception:
        # Best effort: a missing gate file has already been recorded by
        # 'find_files'. Catching Exception (not a bare except) lets
        # KeyboardInterrupt and SystemExit propagate.
        pass
    return md
# Example script: load a sample FCS file, apply an hlog transform to three
# channels and draw a scatter plot of two of them.
import os
from pylab import *
import FlowCytometryTools
from FlowCytometryTools import FCMeasurement

# Locate sample data included with this package
datadir = os.path.join(FlowCytometryTools.__path__[0], 'tests', 'data', 'Plate01')
datafile = os.path.join(datadir, 'RFP_Well_A3.fcs')

# datafile = '[insert path to your own fcs file]'

# Load data, then transform the listed channels with 'hlog' (b=500.0)
tsample = FCMeasurement(ID='Test Sample', datafile=datafile)
tsample = tsample.transform('hlog', channels=['Y2-A', 'B1-A', 'V2-A'], b=500.0)

# Plot the two channels against each other as a scatter plot
tsample.plot(['Y2-A', 'B1-A'], kind='scatter', alpha=0.6, color='gray')
# `grid` is supplied by the star import from pylab above
grid(True)

#show() # <-- Uncomment when running as a script.
def fcs_to_pandas(fcs_folder, fcs_file):
    """Load ``fcs_folder + fcs_file`` and return its contents as a DataFrame.

    The two arguments are concatenated as-is (no separator is inserted);
    the file name doubles as the measurement ID. The DataFrame is built
    from a full slice (``[:]``) of the loaded measurement.
    """
    measurement = FCMeasurement(ID=fcs_file, datafile=fcs_folder + fcs_file)
    everything = measurement[:]
    return pd.DataFrame(everything)
def load_fcs_data(filename):
    """Load a data file from ``_DATA_DIR``.

    Parameters
    ----------
    filename : str
        Name of the file inside ``_DATA_DIR``.

    Returns
    -------
    The unpickled object when ``filename`` ends with ``.pk``; otherwise an
    ``FCMeasurement`` whose ID is ``filename``.
    """
    filepath = os.path.join(_DATA_DIR, filename)
    if filename.endswith('.pk'):
        # pickle.load needs an open binary file object, not a path.
        # NOTE(security): unpickling can execute arbitrary code - only use
        # this with pickle files from a trusted source.
        with open(filepath, 'rb') as handle:
            return pickle.load(handle)
    return FCMeasurement(ID=filename, datafile=filepath)
def extract(datafile):
    """Load an FCS file, print its channel names and return data and names.

    Returns a tuple ``(data, channel_names)`` taken from the loaded sample.
    """
    sample = FCMeasurement(ID='Test Sample', datafile=datafile)
    print(sample.channel_names)
    data = sample.data
    names = sample.channel_names
    return data, names
def __init__(self, fcs):
    """Load the FCS file ``fcs`` and start with empty clusters and stats.

    ``fcs`` is kept on ``self.prf`` and used both as the measurement ID and
    as its path; a 'loading ...' message is emitted through ``msg`` before
    the file is read into ``self.dat``.
    """
    self.prf = fcs
    msg('loading %s' % fcs)
    measurement = FCMeasurement(ID=fcs, datafile=fcs)
    self.dat = measurement
    self.clusters, self.stats = {}, {}
# Interactive analysis script: loads two T98G FCS files, bins the first
# one's 'BL1-A' signal by 'FSC-A', then lets the user draw a polygon gate on
# the second one and keeps the events inside it.
# NOTE(review): `np` and `get_bin_means` are used below but are not imported
# or defined in this excerpt - presumably provided elsewhere; confirm.
import seaborn as sb
import pandas as pd
import matplotlib.path as mplPath
import matplotlib.pyplot as plt
from FlowCytometryTools import FCMeasurement
from roipoly import roipoly
from scipy import stats

sb.set()

#%%
# T98G

# First file: 26 evenly spaced 'FSC-A' bin edges spanning its full range,
# passed with 'FSC-A' and 'BL1-A' to get_bin_means.
filename = '/Users/xies/Box/Others/SZ_FACS_RB/SZ-091320 Async OPP_Group_T98G OPP647 2nd488.fcs'
tg_bg = FCMeasurement(ID='Test Sample', datafile=filename)
size_bins = np.linspace(tg_bg['FSC-A'].min(),tg_bg['FSC-A'].max(),num = 26)
bg_rb = get_bin_means(tg_bg['FSC-A'],tg_bg['BL1-A'],size_bins)

# Second file: scatter 'VL1-A' against 'FSC-A' for manual gating.
filename = '/Users/xies/Box/Others/SZ_FACS_RB/SZ-091320 Async OPP_Group_T98G OPP647 RB488.fcs'
tg = FCMeasurement(ID='Test Sample', datafile=filename)
dapi = tg['VL1-A']
fsc = tg['FSC-A']
plt.scatter(dapi,fsc,alpha=0.002)

# Polygon drawn by the user on the scatter plot above.
g1_gate = roipoly(color='r')
# Turn the polygon vertices into a Path and test every event against it.
g1_gate_p = mplPath.Path( np.vstack((g1_gate.x,g1_gate.y)).T )
g1_gateI = g1_gate_p.contains_points( np.vstack((tg['VL1-A'],tg['FSC-A'])).T )
# Keep only the events that fall inside the gate.
tg_g1 = tg[g1_gateI]
def read_data(self):
    """Load the FCS file at ``self.datafile`` into ``self.sample``."""
    measurement = FCMeasurement(ID='Test Sample', datafile=self.datafile)
    self.sample = measurement