コード例 #1
0
    def run(self, distribute: Optional[bool] = False, **kwargs):
        """Compute per-cell features for every row of the loaddata manifest.

        Parameters
        ----------
        distribute: Optional[bool]
            When True, hand the work off to cluster jobs and return
            immediately; results must be collected in a later run.
        **kwargs
            Accepted for interface compatibility; unused here.

        Returns
        -------
        Path to the saved manifest when run locally, None when distributed.
        """
        with general.configuration(self.step_local_staging_dir) as control:

            device = io.LocalStagingIO(control)
            df = device.load_step_manifest("loaddata")
            log.info(f"Manifest: {df.shape}")

            save_dir = self.step_local_staging_dir / "cell_features"
            save_dir.mkdir(parents=True, exist_ok=True)

            if distribute:

                distributor = cluster.FeaturesDistributor(control)
                distributor.set_data(df)
                distributor.distribute()
                log.info("Multiple jobs have been launched. Please come back when the calculation is complete.")

                return None

            calculator = FeatureCalculator(control)
            with concurrent.futures.ProcessPoolExecutor(control.get_ncores()) as executor:
                # Materialize the map iterator: otherwise exceptions raised in
                # worker processes are silently discarded and never surface here.
                list(executor.map(calculator.execute, [row for _, row in df.iterrows()]))

            log.info("Loading results...")
            df_results = calculator.load_results_in_single_dataframe()
            df_results = df_results.set_index('CellId')

            log.info("Saving manifest...")
            self.manifest = df.merge(df_results, left_index=True, right_index=True)
            manifest_path = self.step_local_staging_dir / 'manifest.csv'
            self.manifest.to_csv(manifest_path)

        # Return the manifest location, consistent with the other step classes.
        return manifest_path
コード例 #2
0
    def run(self, distribute: Optional[bool] = False, **kwargs):
        """Compute single-cell parameterized representations.

        Parameters
        ----------
        distribute: Optional[bool]
            When True, hand the work off to cluster jobs and return
            immediately; results must be collected in a later run.
        **kwargs
            Accepted for interface compatibility; unused here.
        """
        with general.configuration(self.step_local_staging_dir) as control:

            device = io.LocalStagingIO(control)
            df = device.load_step_manifest("preprocessing")
            log.info(f"Manifest: {df.shape}")

            save_dir = self.step_local_staging_dir / "representations"
            save_dir.mkdir(parents=True, exist_ok=True)

            if distribute:

                distributor = cluster.ParameterizationDistributor(control)
                distributor.set_data(df)
                distributor.distribute()
                log.info("Multiple jobs have been launched. Please come back when the calculation is complete.")

                return None

            parameterizer = Parameterizer(control)
            with concurrent.futures.ProcessPoolExecutor(control.get_ncores()) as executor:
                # Materialize the map iterator: otherwise exceptions raised in
                # worker processes are silently discarded and never surface here.
                list(executor.map(parameterizer.execute, [row for _, row in df.iterrows()]))
コード例 #3
0
    def run(
        self,
        filter=None,
        debug: bool = False,
        **kwargs
    ):
        """Build the preprocessing manifest from the computed features.

        Optionally removes mitotic cells and detected outliers, applies the
        configured filtering step, drops rows with NaN feature values and
        writes the resulting manifest to the staging directory.

        Returns the path of the saved manifest CSV.
        """
        with general.configuration(self.step_local_staging_dir) as control:

            device = io.LocalStagingIO(control)
            df = device.load_step_manifest("computefeatures")
            log.info(f"Shape of manifest: {df.shape}")

            if control.remove_mitotics():
                if "cell_stage" not in df.columns:
                    raise ValueError("Column cell_stage not found.")
                # Keep only cells in stage M0 (non-mitotic).
                df = df.loc[df.cell_stage == 'M0']
                log.info(f"Manifest without mitotics: {df.shape}")

            if control.remove_outliers():
                outliers_dir = self.step_local_staging_dir / "outliers"
                outliers_dir.mkdir(parents=True, exist_ok=True)

                outliers_csv = self.step_local_staging_dir / "outliers.csv"
                # Reuse a previous detection unless missing or overwrite requested.
                if not outliers_csv.is_file() or control.overwrite():
                    log.info("Computing outliers...")
                    df_outliers = outliers_removal(df=df, output_dir=outliers_dir, log=log)
                    df_outliers.to_csv(outliers_csv)
                else:
                    log.info("Using pre-detected outliers.")
                    df_outliers = pd.read_csv(outliers_csv, index_col='CellId')

                # Align to the (possibly already reduced) manifest and drop outliers.
                df_outliers = df_outliers.loc[df.index]
                df.loc[df_outliers.index, 'Outlier'] = df_outliers['Outlier']
                df = df.loc[df.Outlier == 'No'].drop(columns=['Outlier'])
                log.info(f"Shape of data without outliers: {df.shape}")

            if control.is_filtering_on():
                df = filtering(df, control)

            # Remove rows for which any (non-transform) PCA feature is NaN.
            aliases = control.get_aliases_for_pca()
            feature_cols = [
                col for col in df.columns
                if any(alias in col for alias in aliases) and "transform" not in col
            ]
            df_na = df.loc[df[feature_cols].isna().any(axis=1)]
            if len(df_na):
                print(df_na.head())
                print(f"{len(df_na)} rows found with NaN values.")
                df = df.loc[~df.index.isin(df_na.index)]

            log.info("Saving manifest...")
            self.manifest = df
            manifest_path = self.step_local_staging_dir / 'manifest.csv'
            self.manifest.to_csv(manifest_path)

            return manifest_path
コード例 #4
0
    def run(self, distribute: Optional[bool] = False, **kwargs):
        """Compute pairwise concordance values and generate the plots.

        Parameters
        ----------
        distribute: Optional[bool]
            When True, hand the work off to cluster jobs and return
            immediately; results must be collected in a later run.
        **kwargs
            Accepted for interface compatibility; unused here.
        """
        with general.configuration(self.step_local_staging_dir) as control:

            for folder in ['values', 'plots']:
                save_dir = self.step_local_staging_dir / folder
                save_dir.mkdir(parents=True, exist_ok=True)

            device = io.LocalStagingIO(control)
            df = device.load_step_manifest("preprocessing")
            space = shapespace.ShapeSpace(control)
            space.execute(df)
            variables = control.get_variables_values_for_aggregation()
            variables = control.duplicate_variable(variables, "structure")
            df_agg = space.get_aggregated_df(variables, False)

            if distribute:

                distributor = cluster.ConcordanceDistributor(control)
                distributor.set_data(df_agg)
                distributor.distribute()
                log.info("Multiple jobs have been launched. Please come back when the calculation is complete.")

                return None

            calculator = ConcordanceCalculator(control)
            with concurrent.futures.ProcessPoolExecutor(control.get_ncores()) as executor:
                # Materialize the map iterator: otherwise exceptions raised in
                # worker processes are silently discarded and never surface here.
                list(executor.map(calculator.execute, [row for _, row in df_agg.iterrows()]))

            log.info("Loading results...")

            df_results = calculator.load_results_in_single_dataframe()

            log.info("Generating plots...")

            pmaker = plotting.ConcordancePlotMaker(control)
            pmaker.set_dataframe(df_results)
            for alias in tqdm(control.get_aliases_to_parameterize()):
                for shape_mode in control.get_shape_modes():
                    # One plot at the shape-space center, one across the extremes.
                    mpId = control.get_center_map_point_index()
                    pmaker.filter_dataframe({
                        'alias': alias,
                        'shape_mode': shape_mode,
                        'mpId': [mpId]
                    })
                    pmaker.execute(display=False)
                    mpIds = control.get_extreme_opposite_map_point_indexes()
                    pmaker.filter_dataframe({
                        'alias': alias,
                        'shape_mode': shape_mode,
                        'mpId': mpIds
                    })
                    pmaker.execute(display=False)
コード例 #5
0
ファイル: load_data.py プロジェクト: colobas/cvapipe_analysis
    def run(self, **kwargs):
        """Load the dataset and persist its manifest.

        Returns the path of the saved manifest CSV.
        """
        with general.configuration(self.step_local_staging_dir) as control:

            # DataLoader receives the raw kwargs dict as a single argument.
            self.manifest = DataLoader(control).load(kwargs)

            manifest_path = self.step_local_staging_dir / 'manifest.csv'
            self.manifest.to_csv(manifest_path)

        return manifest_path
コード例 #6
0
    def run(self, distribute: Optional[bool] = False, **kwargs):
        """Compute stereotypy values and generate the corresponding plots.

        When *distribute* is True, the computation is launched as cluster
        jobs and this method returns immediately.
        """
        with general.configuration(self.step_local_staging_dir) as control:

            for folder in ['values', 'plots']:
                save_dir = self.step_local_staging_dir / folder
                save_dir.mkdir(parents=True, exist_ok=True)

            device = io.LocalStagingIO(control)
            df = device.load_step_manifest("preprocessing")
            space = shapespace.ShapeSpace(control)
            space.execute(df)
            variables = control.get_variables_values_for_aggregation()
            df_agg = space.get_aggregated_df(variables, True)

            if distribute:

                distributor = cluster.StereotypyDistributor(control)
                distributor.set_data(df_agg)
                distributor.distribute()
                log.info("Multiple jobs have been launched. Please come back when the calculation is complete.")

                return None

            calculator = StereotypyCalculator(control)
            # The calculator spawns concurrent processes internally, so the
            # rows are processed serially here on purpose.
            for _, agg_row in tqdm(df_agg.iterrows(), total=len(df_agg)):
                calculator.execute(agg_row)

            log.info("Loading results...")

            df_results = calculator.load_results_in_single_dataframe()

            log.info("Generating plots...")

            pmaker = plotting.StereotypyPlotMaker(control)
            pmaker.set_dataframe(df_results)
            for alias in tqdm(control.get_aliases_to_parameterize()):
                for shape_mode in control.get_shape_modes():
                    center_mpId = control.get_center_map_point_index()
                    pmaker.filter_dataframe({
                        'alias': alias,
                        'shape_mode': shape_mode,
                        'mpId': [center_mpId]
                    })
                    pmaker.execute(display=False)
コード例 #7
0
    def run(self, debug: bool = False, **kwargs):
        """Compute the shape modes (PCA and average shapes) from the
        preprocessing manifest."""
        with general.configuration(self.step_local_staging_dir) as control:

            device = io.LocalStagingIO(control)
            df = device.load_step_manifest("preprocessing")
            log.info(f"Manifest: {df.shape}")

            # Output folders for the PCA results and the average shapes.
            for folder in ["pca", "avgshape"]:
                (self.step_local_staging_dir / folder).mkdir(parents=True, exist_ok=True)

            calculator = ShapeModeCalculator(control)
            calculator.set_data(df)
            calculator.execute()

        return
コード例 #8
0
    def run(self, distribute: Optional[bool] = False, **kwargs):
        """Aggregate representations and morphed images per shape-space bin.

        When *distribute* is True, the computation is launched as cluster
        jobs and this method returns immediately.
        """
        with general.configuration(self.step_local_staging_dir) as control:

            for folder in ["repsagg", "aggmorph"]:
                (self.step_local_staging_dir / folder).mkdir(parents=True, exist_ok=True)

            device = io.LocalStagingIO(control)
            df = device.load_step_manifest("preprocessing")
            space = shapespace.ShapeSpace(control)
            space.execute(df)
            variables = control.get_variables_values_for_aggregation()
            df_agg = space.get_aggregated_df(variables)

            if distribute:

                distributor = cluster.AggregationDistributor(control)
                distributor.set_data(df_agg)
                '''Setting chunk size to 1 here so that each job has to generate
                a single file. Otherwise Slurm crashes for reasons that I don't
                yet know. It seems to me that aggregation_tools.py is leaking
                memory. To be investigated.'''
                distributor.set_chunk_size(1)
                distributor.distribute()
                log.info("Multiple jobs have been launched. Please come back when the calculation is complete.")

                return None

            aggregator = Aggregator(control)
            # The aggregator spawns concurrent processes internally, so the
            # rows are processed serially here on purpose.
            for _, agg_row in tqdm(df_agg.iterrows(), total=len(df_agg)):
                aggregator.execute(agg_row)

        return