def run(self, distribute: Optional[bool] = False, **kwargs):
    """Compute single-cell features for every row of the loaddata manifest.

    Parameters
    ----------
    distribute : Optional[bool]
        When True, launch the computation as cluster jobs and return
        immediately; otherwise run locally with a process pool.

    Returns
    -------
    Path or None
        Path to the saved manifest CSV, or None when jobs were distributed.
    """
    with general.configuration(self.step_local_staging_dir) as control:
        device = io.LocalStagingIO(control)
        df = device.load_step_manifest("loaddata")
        log.info(f"Manifest: {df.shape}")

        save_dir = self.step_local_staging_dir / "cell_features"
        save_dir.mkdir(parents=True, exist_ok=True)

        if distribute:
            distributor = cluster.FeaturesDistributor(control)
            distributor.set_data(df)
            distributor.distribute()
            # Plain string: message has no interpolation placeholders.
            log.info("Multiple jobs have been launched. Please come back when the calculation is complete.")
            return None

        calculator = FeatureCalculator(control)
        with concurrent.futures.ProcessPoolExecutor(control.get_ncores()) as executor:
            executor.map(calculator.execute, [row for _, row in df.iterrows()])

        log.info("Loading results...")
        df_results = calculator.load_results_in_single_dataframe()
        df_results = df_results.set_index('CellId')

        log.info("Saving manifest...")
        # Merge on index; assumes df is indexed by CellId as well — consistent
        # with df_results being re-indexed on CellId above.
        self.manifest = df.merge(df_results, left_index=True, right_index=True)
        manifest_path = self.step_local_staging_dir / 'manifest.csv'
        self.manifest.to_csv(manifest_path)
        # Return the manifest location for consistency with the other steps
        # (backward compatible: callers that ignored the old None still work).
        return manifest_path
def run(self, distribute: Optional[bool] = False, **kwargs):
    """Parameterize every cell in the preprocessing manifest.

    Computes per-cell representations into the ``representations`` folder.

    Parameters
    ----------
    distribute : Optional[bool]
        When True, launch the computation as cluster jobs and return
        immediately; otherwise run locally with a process pool.

    Returns
    -------
    None
    """
    with general.configuration(self.step_local_staging_dir) as control:
        device = io.LocalStagingIO(control)
        df = device.load_step_manifest("preprocessing")
        log.info(f"Manifest: {df.shape}")

        save_dir = self.step_local_staging_dir / "representations"
        save_dir.mkdir(parents=True, exist_ok=True)

        if distribute:
            distributor = cluster.ParameterizationDistributor(control)
            distributor.set_data(df)
            distributor.distribute()
            # Plain string: message has no interpolation placeholders.
            log.info("Multiple jobs have been launched. Please come back when the calculation is complete.")
            return None

        parameterizer = Parameterizer(control)
        with concurrent.futures.ProcessPoolExecutor(control.get_ncores()) as executor:
            executor.map(parameterizer.execute, [row for _, row in df.iterrows()])
def run(self, filter=None, debug: bool = False, **kwargs):
    """Preprocess the computefeatures manifest: remove mitotics, outliers and NaNs.

    Parameters
    ----------
    filter :
        Unused here; kept for interface compatibility with the step runner.
    debug : bool
        Unused here; kept for interface compatibility with the step runner.

    Returns
    -------
    Path
        Path to the saved manifest CSV.

    Raises
    ------
    ValueError
        If mitotic removal is requested but the ``cell_stage`` column is absent.
    """
    with general.configuration(self.step_local_staging_dir) as control:
        device = io.LocalStagingIO(control)
        df = device.load_step_manifest("computefeatures")
        log.info(f"Shape of manifest: {df.shape}")

        if control.remove_mitotics():
            if "cell_stage" not in df.columns:
                raise ValueError("Column cell_stage not found.")
            # Keep only interphase (M0) cells.
            df = df.loc[df.cell_stage == 'M0']
            log.info(f"Manifest without mitotics: {df.shape}")

        if control.remove_outliers():
            path_to_outliers_folder = self.step_local_staging_dir / "outliers"
            path_to_outliers_folder.mkdir(parents=True, exist_ok=True)
            path_to_df_outliers = self.step_local_staging_dir / "outliers.csv"
            # Reuse a previously computed outliers table unless overwriting.
            if not path_to_df_outliers.is_file() or control.overwrite():
                log.info("Computing outliers...")
                df_outliers = outliers_removal(df=df, output_dir=path_to_outliers_folder, log=log)
                df_outliers.to_csv(path_to_df_outliers)
            else:
                log.info("Using pre-detected outliers.")
                df_outliers = pd.read_csv(path_to_df_outliers, index_col='CellId')
            df_outliers = df_outliers.loc[df.index]
            df.loc[df_outliers.index, 'Outlier'] = df_outliers['Outlier']
            df = df.loc[df.Outlier == 'No']
            df = df.drop(columns=['Outlier'])
            log.info(f"Shape of data without outliers: {df.shape}")

        if control.is_filtering_on():
            df = filtering(df, control)

        # Remove rows for which any feature used by the PCA is NaN
        # (transform columns are excluded from the check).
        aliases = control.get_aliases_for_pca()
        columns = [f for f in df.columns if any(w in f for w in aliases)]
        columns = [c for c in columns if "transform" not in c]
        df_na = df.loc[df[columns].isna().any(axis=1)]
        if len(df_na):
            # Route diagnostics through the step logger instead of print()
            # so they end up in the same place as every other message.
            log.info(df_na.head())
            log.info(f"{len(df_na)} rows found with NaN values.")
            df = df.loc[~df.index.isin(df_na.index)]

        log.info("Saving manifest...")
        self.manifest = df
        manifest_path = self.step_local_staging_dir / 'manifest.csv'
        self.manifest.to_csv(manifest_path)
        return manifest_path
def run(self, distribute: Optional[bool] = False, **kwargs):
    """Compute pairwise concordance between structures and generate plots.

    Parameters
    ----------
    distribute : Optional[bool]
        When True, launch the computation as cluster jobs and return
        immediately; otherwise run locally with a process pool.

    Returns
    -------
    None
    """
    with general.configuration(self.step_local_staging_dir) as control:
        for folder in ['values', 'plots']:
            save_dir = self.step_local_staging_dir / folder
            save_dir.mkdir(parents=True, exist_ok=True)

        device = io.LocalStagingIO(control)
        df = device.load_step_manifest("preprocessing")
        space = shapespace.ShapeSpace(control)
        space.execute(df)
        variables = control.get_variables_values_for_aggregation()
        # Structure is duplicated so concordance is computed between pairs.
        variables = control.duplicate_variable(variables, "structure")
        df_agg = space.get_aggregated_df(variables, False)

        if distribute:
            distributor = cluster.ConcordanceDistributor(control)
            distributor.set_data(df_agg)
            distributor.distribute()
            # Plain string: message has no interpolation placeholders.
            log.info("Multiple jobs have been launched. Please come back when the calculation is complete.")
            return None

        calculator = ConcordanceCalculator(control)
        with concurrent.futures.ProcessPoolExecutor(control.get_ncores()) as executor:
            executor.map(calculator.execute, [row for _, row in df_agg.iterrows()])

        log.info("Loading results...")
        df_results = calculator.load_results_in_single_dataframe()

        log.info("Generating plots...")
        pmaker = plotting.ConcordancePlotMaker(control)
        pmaker.set_dataframe(df_results)
        # Map-point indexes do not depend on alias/shape mode; hoist them
        # out of the nested loop instead of re-querying per iteration.
        mpId = control.get_center_map_point_index()
        mpIds = control.get_extreme_opposite_map_point_indexes()
        for alias in tqdm(control.get_aliases_to_parameterize()):
            for shape_mode in control.get_shape_modes():
                # Plot at the shape-space center...
                pmaker.filter_dataframe({
                    'alias': alias,
                    'shape_mode': shape_mode,
                    'mpId': [mpId]
                })
                pmaker.execute(display=False)
                # ...and at the two extreme opposite map points.
                pmaker.filter_dataframe({
                    'alias': alias,
                    'shape_mode': shape_mode,
                    'mpId': mpIds
                })
                pmaker.execute(display=False)
def run(self, **kwargs):
    """Load the input dataset and save it as this step's manifest.

    Returns
    -------
    Path
        Path to the saved manifest CSV.
    """
    staging = self.step_local_staging_dir
    with general.configuration(staging) as control:
        self.manifest = DataLoader(control).load(kwargs)
        path_to_manifest = staging / 'manifest.csv'
        self.manifest.to_csv(path_to_manifest)
        return path_to_manifest
def run(self, distribute: Optional[bool] = False, **kwargs):
    """Compute stereotypy for each aggregated group and generate plots.

    Parameters
    ----------
    distribute : Optional[bool]
        When True, launch the computation as cluster jobs and return
        immediately; otherwise run locally.

    Returns
    -------
    None
    """
    with general.configuration(self.step_local_staging_dir) as control:
        for folder in ['values', 'plots']:
            save_dir = self.step_local_staging_dir / folder
            save_dir.mkdir(parents=True, exist_ok=True)

        device = io.LocalStagingIO(control)
        df = device.load_step_manifest("preprocessing")
        space = shapespace.ShapeSpace(control)
        space.execute(df)
        variables = control.get_variables_values_for_aggregation()
        df_agg = space.get_aggregated_df(variables, True)

        if distribute:
            distributor = cluster.StereotypyDistributor(control)
            distributor.set_data(df_agg)
            distributor.distribute()
            # Plain string: message has no interpolation placeholders.
            log.info("Multiple jobs have been launched. Please come back when the calculation is complete.")
            return None

        calculator = StereotypyCalculator(control)
        for _, row in tqdm(df_agg.iterrows(), total=len(df_agg)):
            # Concurrent processes run inside execute(); do not add an
            # outer process pool here.
            calculator.execute(row)

        log.info("Loading results...")
        df_results = calculator.load_results_in_single_dataframe()

        log.info("Generating plots...")
        pmaker = plotting.StereotypyPlotMaker(control)
        pmaker.set_dataframe(df_results)
        # The center map-point index does not depend on alias/shape mode;
        # query it once instead of per loop iteration.
        mpId = control.get_center_map_point_index()
        for alias in tqdm(control.get_aliases_to_parameterize()):
            for shape_mode in control.get_shape_modes():
                pmaker.filter_dataframe({
                    'alias': alias,
                    'shape_mode': shape_mode,
                    'mpId': [mpId]
                })
                pmaker.execute(display=False)
def run(self, debug: bool = False, **kwargs):
    """Compute the shape space (PCA) and average shapes from the preprocessing manifest."""
    with general.configuration(self.step_local_staging_dir) as control:
        # Output folders for PCA results and average-shape artifacts.
        for subdir in ("pca", "avgshape"):
            (self.step_local_staging_dir / subdir).mkdir(parents=True, exist_ok=True)

        staging_io = io.LocalStagingIO(control)
        manifest_df = staging_io.load_step_manifest("preprocessing")
        log.info(f"Manifest: {manifest_df.shape}")

        calculator = ShapeModeCalculator(control)
        calculator.set_data(manifest_df)
        calculator.execute()
        return
def run(self, distribute: Optional[bool] = False, **kwargs):
    """Aggregate representations and morphed shapes for each aggregated group.

    Parameters
    ----------
    distribute : Optional[bool]
        When True, launch the computation as cluster jobs and return
        immediately; otherwise run locally.

    Returns
    -------
    None
    """
    with general.configuration(self.step_local_staging_dir) as control:
        for folder in ["repsagg", "aggmorph"]:
            save_dir = self.step_local_staging_dir / folder
            save_dir.mkdir(parents=True, exist_ok=True)

        device = io.LocalStagingIO(control)
        df = device.load_step_manifest("preprocessing")
        space = shapespace.ShapeSpace(control)
        space.execute(df)
        variables = control.get_variables_values_for_aggregation()
        df_agg = space.get_aggregated_df(variables)

        if distribute:
            distributor = cluster.AggregationDistributor(control)
            distributor.set_data(df_agg)
            # Chunk size 1 so each job generates a single file. Otherwise
            # Slurm crashes for reasons not yet understood; it seems
            # aggregation_tools.py is leaking memory. To be investigated.
            distributor.set_chunk_size(1)
            distributor.distribute()
            # Plain string: message has no interpolation placeholders.
            log.info("Multiple jobs have been launched. Please come back when the calculation is complete.")
            return None

        aggregator = Aggregator(control)
        # Use `_` for the unused index, consistent with the other steps.
        for _, row in tqdm(df_agg.iterrows(), total=len(df_agg)):
            # Concurrent processes run inside execute(); do not add an
            # outer process pool here.
            aggregator.execute(row)
        return