input_dir) #compare_from=inputfile #Keep only selected markers if filter_markers: selected_markers = read_marker_csv(input_dir) print(compare_to.columns) file_cols = compare_to.columns [x for x in file_cols if x[0].isdigit()] for x in file_cols: if x[0].isdigit(): if x not in selected_markers: compare_to = compare_to.drop(x, axis=1) print(compare_to.columns) # compare_to = downsample_data(compare_to, info_run, output_dir) #Customtest_1 compare_to_arc, marker_list = arcsinh_transf(cofactor, compare_to) #Leeave as default # compare_to_arc = compare_to #Customtest_2 # marker_list = [x for x in compare_to_arc.columns if x[0].isdigit()] #Customtest_2 print('Sample files:') print('\n'.join([f for f in input_files])) print(f'\nReference:\n{denominator}') print('\nMarkers:') print('\n'.join([m for m in marker_list])) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Perform EMD~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Calculate EMD and save to the output folder (with denominator info run): # calculate emd and sign the emd score by the difference of median between compare_from and compare_to emd_df = pd.DataFrame() emd_infodict = {}
    data = fcsparser.parse(file_path, meta_data_only=False)[1]
    # Detect if, despite the flag, columns still match the raw PnN naming
    # pattern ("<mass>Di"), which marks a non-standard .fcs file.
    # NOTE(review): pattern should be a raw string r"(\d+Di$)" — "\d" in a
    # plain string is an invalid escape (SyntaxWarning on modern Python).
    reg_pnn = re.compile("(\d+Di$)")
    pnn_extracted = []  # column names that match the PnN pattern
    for n in data.columns.values.tolist():
        if reg_pnn.search(n):
            pnn_extracted.append(n)
    if len(pnn_extracted) != 0:
        # Force the rpy2 fallback below by raising the same error fcsparser
        # raises for unsupported files
        raise fcsparser.api.ParserFeatureNotImplementedError
except fcsparser.api.ParserFeatureNotImplementedError:
    # ^ the matching try: opens above this excerpt
    print("WARNING: Non-standard .fcs file detected: ", f)
    #use rpy2 to read the files and load into python
    data = read_rFCS(file_path)[0]

if filter_markers:
    #Load .csv with the markers to use -> Often PTMs
    selected_markers = read_marker_csv(input_dir)
    data = data.loc[:, selected_markers]  # Remove unwanted markers

# Arcsinh-transform the data; `markers` is the list of marker columns
data_arc, markers = arcsinh_transf(cofactor, data)

# generate the list of ordered marker-marker pairs for dremi calculation
# (permutations, not combinations: DREMI(x, y) != DREMI(y, x))
marker_pairs = [comb for comb in list(permutations(markers, 2))]
for marker_x, marker_y in marker_pairs:
    # Per-pair metadata row for the results table
    df_info_dict = {}
    df_info_dict["file_origin"] = filename
    df_info_dict["marker_x"] = marker_x
    df_info_dict["marker_y"] = marker_y
    df_info_dict["marker_x_marker_y"] = marker_x + '_' + marker_y
    df_info_dict["num_of_cells"] = data.shape[0]
    if plot == True:
        # Create the per-pair plot directory if missing (if-body continues
        # past this excerpt)
        if os.path.isdir(
                f'{output_dir}/{info_run}/plots/x={marker_x}-y={marker_y}'
        ) == False:
###~Co-factor~###
# Arcsinh co-factor: 5 is the default; let the user override it interactively.
cofactor = 5
user_cofactor = yes_or_NO(
    "Using alpha=5 for the transformation. Would you like to change this value?")
if user_cofactor:
    cofactor = int(input("Enter the new alpha to use (5=default): "))

#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
#~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Perform transformation~~~~~~~~~~~~~~~~~~~~~~~~~~#
# For each input file: read it, report the marker columns, arcsinh-transform,
# and write the result to the output folder.
for input_file in filelist:
    dataset = pd.read_csv(f"{input_dir}/{input_file}", sep='\t')
    print("Data read!")
    # Marker columns are recognised by a leading digit (isotope mass prefix);
    # kept as a one-element list-of-lists to preserve the original report format.
    markers = [[x for x in dataset.columns if x[0].isdigit()]]
    print("Processed columns")
    for marker_group in markers:
        print("Columns identified as markers: \n", marker_group)
    print("Start transform (might take around 10min with larger datasets)")
    normalised_dataset = arcsinh_transf(cofactor, dataset)[0]
    print("Finished transform")
    print(
        "Start writing results to file (might take some time with larger datasets)"
    )
    normalised_dataset.to_csv(f"{output_dir}/arcsinhTRANSF_{input_file}",
                              index=False,
                              sep='\t')
    # Multi-file branch: offer to equalise cell counts across files.
    # NOTE(review): "donwsample" typo is in a user-facing prompt — confirm
    # with the authors before changing the string.
    downs_inputs = yes_or_NO(
        "Multiple input files detected. Would you like to donwsample the number of cells?",
        default="YES")
    if downs_inputs:
        print("Downsampling taking place. Check output folder for more info")
        print(no_arc["file_origin"].value_counts())
        # Downsample and report per-file cell counts before and after
        no_arc = downsample_data(no_arc, info_run, output_dir)
        print(no_arc["file_origin"].value_counts())
    else:
        print("Multiple input files; no downsampling")
else:
    # ^ the matching if (single-input-file case) opens above this excerpt
    print("Only one input file detected; no downsampling")

#Transformation#
#Literature recommends cofactor of 5 for cytof data
arc, cols = arcsinh_transf(cofactor, no_arc)  #Storing marker columns for later use below

#~~~~~~~~~~~~~~~Define the markers used for UMAP calculation~~~~~~~~~~~~~~~~~~#
#Group columns of the dataframe based on the type of measurement
not_markers_cols = [column for column in arc.columns if column not in cols]
all_markers_cols = cols.copy()
# define the v's for umap calculation (vs_markers_cols)
not_these = []  # columns to be excluded for umap calculation
vs_markers_cols = read_marker_csv(
    input_dir)  #Read them from a .csv file in ./input
print(vs_markers_cols)
# Present the selected markers as a table with a 1-based index
df_vs_markers_cols = pd.DataFrame(vs_markers_cols, columns=['marker'])
df_vs_markers_cols.index = np.arange(1, len(df_vs_markers_cols) + 1)
input_dir) #compare_from=inputfile #Keep only selected markers if filter_markers: selected_markers = read_marker_csv(input_dir) print(compare_to.columns) file_cols = compare_to.columns [x for x in file_cols if x[0].isdigit()] for x in file_cols: if x[0].isdigit(): if x not in selected_markers: compare_to = compare_to.drop(x, axis=1) print(compare_to.columns) # compare_to = downsample_data(compare_to, info_run, output_dir) compare_to_arc, marker_list = arcsinh_transf(cofactor, compare_to) print('Sample files:') print('\n'.join([f for f in input_files])) print(f'\nReference:\n{denominator}') print('\nMarkers:') print('\n'.join([m for m in marker_list])) #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~Perform EMD~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~# #Calculate EMD and save to the output folder (with denominator info run): # calculate emd and sign the emd score by the difference of median between compare_from and compare_to emd_df = pd.DataFrame() emd_infodict = {} emd_infodict["denominator"] = denominator