def run(config_path: Path,
        wt_dir: Path,
        mut_dir: Path,
        out_dir: Path,
        target_dir: Path,
        treatment_dir: Path = None,
        interaction_dir: Path = None,
        lines_to_process: Union[List, None] = None):
    """
    The entry point to the stats pipeline.
    Reads in the stats_config, then iterates over the stats analysis methods and the mutant lines.

    Parameters
    ----------
    config_path
        The lama stats_config (in TOML format)
    wt_dir
        Root of the wild type data. Should contain mutant line subfolders
    mut_dir
        Root of the mutant data. Should contain mutant line subfolders
    out_dir
        The root output directory. Will be made if not existing
    target_dir
        Contains the population average, masks, label_maps and label info files.
        All volumes should have been padded to the same size before registration.
    treatment_dir
        Optional root of the treatment data (used for two-way analyses)
    interaction_dir
        Optional root of the interaction data (used for two-way analyses)
    lines_to_process
        list: optional mutant line ids; process only these
        None: process all lines
    """
    if not (wt_dir / 'output').is_dir():
        raise FileNotFoundError(f'{wt_dir / "output"} folder with registration results is not present')
    if not (mut_dir / 'output').is_dir():
        raise FileNotFoundError(f'{mut_dir / "output"} folder with registration results is not present')

    try:
        out_dir.mkdir(exist_ok=True)
    except FileNotFoundError:
        raise FileNotFoundError('Cannot create output folder')

    master_log_file = out_dir / f'{common.date_dhm()}_stats.log'
    logzero.logfile(str(master_log_file))
    logging.info(common.git_log())
    logging.info('### Started stats analysis ###')

    stats_config = cfg_load(config_path)

    mask = load_mask(target_dir, stats_config['mask'])

    label_info_file = target_dir / stats_config.get('label_info')  # TODO: handle the case where this does not exist
    label_map_file = target_dir / stats_config.get('label_map')
    label_map = common.LoadImage(label_map_file).array

    memmap = stats_config.get('memmap')
    if memmap:
        logging.info('Memory mapping input data')

    baseline_file = stats_config.get('baseline_ids')
    if baseline_file:
        baseline_file = config_path.parent / baseline_file

    mutant_file = stats_config.get('mutant_ids')
    if mutant_file:
        mutant_file = config_path.parent / mutant_file

    # Run each data class through the pipeline
    for stats_type in stats_config['stats_types']:
        logzero.logfile(str(master_log_file))
        logging.info(f"---Doing {stats_type} analysis---")
        gc.collect()

        # Load the required stats object and data loader
        loader_class = DataLoader.factory(stats_type)

        loader = loader_class(wt_dir, mut_dir, mask, stats_config, label_info_file,
                              lines_to_process=lines_to_process,
                              baseline_file=baseline_file,
                              mutant_file=mutant_file,
                              memmap=memmap,
                              treatment_dir=treatment_dir,
                              interaction_dir=interaction_dir)

        # Only affects the organ volume loader
        if not stats_config.get('normalise_organ_vol_to_mask'):
            loader.norm_to_mask_volume_on = False

        if loader_class == JacobianDataLoader:
            if stats_config.get('use_log_jacobians') is False:
                loader.data_folder_name = 'jacobians'

        # Currently only the intensity stats get normalised
        loader.normaliser = Normaliser.factory(stats_config.get('normalise'), stats_type)  # TODO: move this into the subclass

        logging.info("Starting iteration over lines")
        common.logMemoryUsageInfo()

        # Use a different iterator if doing a two-way analysis
        if stats_config['two_way']:
            line_iterator = loader.two_way_iterator()
        else:
            line_iterator = loader.line_iterator()
        line_input_data = None

        while True:
            try:
                line_input_data = next(line_iterator)
                logging.info(f"Data for line {line_input_data.line} loaded")
                common.logMemoryUsageInfo()

                line_id = line_input_data.line

                line_stats_out_dir = out_dir / line_id / stats_type
                line_stats_out_dir.mkdir(parents=True, exist_ok=True)

                line_log_file = line_stats_out_dir / f'{common.date_dhm()}_stats.log'
                logzero.logfile(str(line_log_file))
                logging.info(f"Processing line: {line_id}")

                stats_class = Stats.factory(stats_type)
                stats_obj = stats_class(line_input_data, stats_type,
                                        stats_config.get('use_staging', True),
                                        stats_config.get('two_way', False))
                stats_obj.stats_runner = linear_model.lm_r
                stats_obj.run_stats()
                logging.info('Statistical analysis finished.')
                common.logMemoryUsageInfo()

                logging.info('Writing results...')
                rw = ResultsWriter.factory(stats_type)
                writer = rw(stats_obj, mask, line_stats_out_dir, stats_type, label_map,
                            label_info_file, stats_config.get('two_way', False))
                logging.info('Finished writing results.')
                common.logMemoryUsageInfo()

                # if stats_type == 'organ_volumes':
                #     c_data = {spec: data['t'] for spec, data in stats_obj.specimen_results.items()}
                #     c_df = pd.DataFrame.from_dict(c_data)
                #     cluster_plots.tsne_on_raw_data(c_df, line_stats_out_dir)

                if stats_config.get('invert_stats'):
                    if writer.line_heatmap:  # Organ volumes will not have this
                        # TODO: find a sensible way to get the path to the invert.yaml,
                        # i.e. the invert_configs for each specimen in the line
                        logging.info('Writing heatmaps...')
                        logging.info('Propagating the heatmaps back onto the input images')
                        line_heatmap = writer.line_heatmap
                        line_reg_dir = mut_dir / 'output' / line_id
                        invert_heatmaps(line_heatmap, line_stats_out_dir, line_reg_dir, line_input_data)
                        logging.info('Finished writing heatmaps.')

                logging.info(f"Finished processing line: {line_id} - All done")
                common.logMemoryUsageInfo()

            except StopIteration:
                if line_input_data is not None:
                    logging.info("Finished iterating over lines")
                    line_input_data.cleanup()
                    common.logMemoryUsageInfo()
                break
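
# Example invocation (a minimal sketch, not taken from this module: the paths
# and the line id are hypothetical, and the call assumes the directory layout
# described in the docstring above):
#
#   from pathlib import Path
#
#   run(config_path=Path('stats_config.toml'),
#       wt_dir=Path('wildtypes'),            # must contain an 'output' subfolder
#       mut_dir=Path('mutants'),             # must contain an 'output' subfolder
#       out_dir=Path('stats_out'),
#       target_dir=Path('target'),           # population average, mask, label map
#       lines_to_process=['mutant_line_1'])  # or None to process all lines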
def run(wt_dir: Path,
        mut_dir: Path,
        out_dir: Path,
        num_perms: int,
        label_info: Path = None,
        label_map_path: Path = None,
        line_fdr: float = 0.05,
        specimen_fdr: float = 0.2,
        normalise_to_whole_embryo: bool = True):
    """
    Run the permutation-based stats pipeline

    Parameters
    ----------
    wt_dir
        Root of the wild type registration output.
        This should contain an 'output' folder that contains a single baseline folder that contains multiple
        specimen folders
    mut_dir
        Root of the mutant registration output.
        This should contain an 'output' folder that contains multiple mutant line folders, each containing one or
        more mutant specimen folders
    out_dir
        Where to store the intermediate results of the permutation testing
    num_perms
        Number of permutations to do
    label_info
        If supplied, use it to annotate the results with label names. Can also be used to filter certain labels
        from the analysis using the 'no_analysis' column
    line_fdr
        The FDR threshold at which to accept line-level calls
    specimen_fdr
        The FDR threshold at which to accept specimen-level calls
    normalise_to_whole_embryo
        Whether to divide each organ volume by the whole embryo volume
    """
    # Collate all the staging and organ volume data into CSVs
    np.random.seed(999)
    init_logging(out_dir / 'stats.log')
    logging.info(common.git_log())
    logging.info(f'Running {__name__} with following commands\n{common.command_line_agrs()}')

    wt_staging = get_staging_data(wt_dir)
    mut_staging = get_staging_data(mut_dir)

    wt_organ_vol = get_organ_volume_data(wt_dir)
    mut_organ_vol = get_organ_volume_data(mut_dir)

    data = prepare_data(wt_organ_vol, wt_staging, mut_organ_vol, mut_staging,
                        label_meta=label_info,
                        normalise_to_whole_embryo=normalise_to_whole_embryo)

    # Keep a record of the input data used in the analysis
    data.to_csv(out_dir / 'input_data.csv')

    # Keep raw data for plotting
    raw_wt_vols = wt_organ_vol.copy()

    out_dir.mkdir(exist_ok=True, parents=True)  # Root directory for output

    # Make a directory to store distributions and thresholds
    dists_out = out_dir / 'distributions'
    dists_out.mkdir(exist_ok=True)

    # Get the null distributions
    line_null, specimen_null, null_ids = distributions.null(data, num_perms)

    with open(dists_out / 'null_ids.yaml', 'w') as fh:
        yaml.dump(null_ids, fh)

    null_line_pvals_file = dists_out / 'null_line_dist_pvalues.csv'
    null_specimen_pvals_file = dists_out / 'null_specimen_dist_pvalues.csv'

    # Write the null distributions to file
    line_null.to_csv(null_line_pvals_file)
    specimen_null.to_csv(null_specimen_pvals_file)

    # Get the alternative distributions
    line_alt, spec_alt = distributions.alternative(data)

    line_alt_pvals_file = dists_out / 'alt_line_dist_pvalues.csv'
    spec_alt_pvals_file = dists_out / 'alt_specimen_dist_pvalues.csv'

    # Write the alternative distributions to file
    line_alt.to_csv(line_alt_pvals_file)
    spec_alt.to_csv(spec_alt_pvals_file)

    line_organ_thresholds = p_thresholds.get_thresholds(line_null, line_alt)
    specimen_organ_thresholds = p_thresholds.get_thresholds(specimen_null, spec_alt)

    line_thresholds_path = dists_out / 'line_organ_p_thresholds.csv'
    spec_thresholds_path = dists_out / 'specimen_organ_p_thresholds.csv'

    line_organ_thresholds.to_csv(line_thresholds_path)
    specimen_organ_thresholds.to_csv(spec_thresholds_path)

    logging.info('Annotating lines')

    lines_root_dir = out_dir / 'lines'
    lines_root_dir.mkdir(exist_ok=True)

    # Annotate lines
    logging.info(f"Annotating lines, using an FDR threshold of {line_fdr}")
    annotate(line_organ_thresholds, line_alt, lines_root_dir,
             label_info=label_info, label_map=label_map_path,
             write_thresholded_inv_labels=True, fdr_threshold=line_fdr)

    # Annotate specimens
    logging.info(f"Annotating specimens, using an FDR threshold of {specimen_fdr}")
    annotate(specimen_organ_thresholds, spec_alt, lines_root_dir, line_level=False,
             label_info=label_info, label_map=label_map_path, fdr_threshold=specimen_fdr)

    mut_dir_ = mut_dir / 'output'

    make_plots(mut_dir_, raw_wt_vols, wt_staging, label_info, lines_root_dir)

    dist_plot_root = out_dir / 'distribution_plots'
    line_plot_dir = dist_plot_root / 'line_level'
    line_plot_dir.mkdir(parents=True, exist_ok=True)
    pvalue_dist_plots(line_null, line_alt, line_organ_thresholds, line_plot_dir)

    specimen_plot_dir = dist_plot_root / 'specimen_level'
    specimen_plot_dir.mkdir(parents=True, exist_ok=True)
    pvalue_dist_plots(specimen_null, spec_alt.drop(columns=['line']), specimen_organ_thresholds, specimen_plot_dir)
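
# Example invocation (a hedged sketch; the paths and the permutation count are
# placeholders rather than values from this module):
#
#   from pathlib import Path
#
#   run(wt_dir=Path('baselines'),
#       mut_dir=Path('mutants'),
#       out_dir=Path('permutation_stats_out'),
#       num_perms=1000,                      # more permutations give a smoother null
#       label_info=Path('target/label_info.csv'),
#       line_fdr=0.05,
#       specimen_fdr=0.2)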
def job_runner(config_path: Path) -> Path:
    """
    Run the registrations specified in the config file

    Returns
    -------
    The path to the final registered images
    """
    config = LamaConfig(config_path)

    print(common.git_log())

    avg_dir = config.options['average_folder']
    avg_dir.mkdir(exist_ok=True, parents=True)

    elastix_stage_parameters = generate_elx_parameters(config, do_pairwise=config['pairwise_registration'])

    # Set up the fixed volume for the first stage. This will change each stage if making a population average
    fixed_vol = config['fixed_volume']

    # Get the list of specimens
    inputs_dir = config.options['inputs']
    spec_ids = [Path(x).stem for x in common.get_file_paths(inputs_dir)]

    for i, reg_stage in enumerate(config['registration_stage_params']):

        stage_id = reg_stage['stage_id']
        logging.info(stage_id)
        stage_dir = Path(config.stage_dirs[stage_id])

        # Make the stage dir if not already made by another instance of the script
        stage_dir.mkdir(exist_ok=True, parents=True)

        starting_avg = stage_dir / 'avg_started'
        average_done = stage_dir / 'avg_done'

        while True:  # Pick up unstarted specimens. Only break when registration and average are complete

            # Check if any specimens are left (the avg may be being made while all specimens are registered)
            spec_stage_dirs = [x.name for x in stage_dir.iterdir() if x.is_dir()]
            not_started = set(spec_ids).difference(spec_stage_dirs)

            next_stage = False  # No breaking out yet

            if len(not_started) > 0:
                next_spec_id = list(not_started)[0]  # Some specimens left. Pick a spec_id and process it
            else:  # All specimens are being processed
                next_stage = True

                # This block controls what happens once all specimens are registered
                while True:

                    if not check_stage_done(stage_dir):
                        print('waiting for stage to finish')
                        time.sleep(5)
                        continue

                    print('stage finished')

                    if average_done.is_file():
                        print('found average done file')
                        break  # Next stage
                    else:
                        if starting_avg.is_file():
                            print('found starting average file')
                            time.sleep(5)
                            continue
                        else:
                            try:
                                open(starting_avg, 'x')
                            except FileExistsError:
                                time.sleep(5)
                                print('cannot write avg starting file')
                                continue
                            else:
                                average_path = avg_dir / f'{stage_id}.nrrd'
                                make_avg(stage_dir, average_path, avg_dir / f'{stage_id}.log')
                                open(average_done, 'x').close()
                                print('making average')
                                break

            if next_stage:
                print('breaking stage')
                break

            # Get the input for this specimen
            if i == 0:  # The first stage
                moving = inputs_dir / f'{next_spec_id}.nrrd'
            else:
                moving = list(config.stage_dirs.values())[i - 1] / next_spec_id / f'{next_spec_id}.nrrd'
                fixed_vol = avg_dir / f'{list(config.stage_dirs.keys())[i - 1]}.nrrd'

            reg_method = TargetBasedRegistration

            # Make the elastix parameter file for this stage
            elxparam = elastix_stage_parameters[stage_id]
            elxparam_path = stage_dir / f'{ELX_PARAM_PREFIX}{stage_id}.txt'

            if not elxparam_path.is_file():
                with open(elxparam_path, 'w') as fh:
                    if elxparam:
                        fh.write(elxparam)

            fixed_mask = None

            logging.info(moving)

            # Do the registrations
            registrator = reg_method(elxparam_path,
                                     moving,
                                     stage_dir,
                                     config['filetype'],
                                     config['threads'],
                                     fixed_mask)

            registrator.set_target(fixed_vol)

            try:
                registrator.run()  # Do the registrations for a single stage
            except FileExistsError:
                # 040620: Bodge, as some specimens are picked up twice.
                # Need a better way to make sure each specimen is picked up only once
                continue

            spec_done = stage_dir / next_spec_id / 'spec_done'  # The directory gets created in .run()
            open(spec_done, 'x').close()
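
# The stage coordination above uses exclusive-create ('x' mode) as a crude
# cross-process lock: whichever job creates 'avg_started' first gets to build
# the average. A minimal, self-contained sketch of that claim pattern (the
# file name and helper are illustrative, not part of this module):
#
#   from pathlib import Path
#
#   def try_claim(lock_file: Path) -> bool:
#       """Return True if this process won the race to create lock_file."""
#       try:
#           open(lock_file, 'x').close()  # exclusive create fails if the file exists
#           return True
#       except FileExistsError:
#           return False  # another job got there first
#
#   if try_claim(Path('avg_started')):
#       ...  # build the average, then touch 'avg_done'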
def run(configfile: Path):
    """
    This is the main function of the LAMA script for generating data from registering volumes.
    It reads in the config file, creates directories, and initialises the registration process.
    Paths to inputs are looked for relative to the directory containing the config file.

    Parameters
    ----------
    configfile
        A TOML config file
    """
    try:
        config = LamaConfig(configfile)
    except OSError as e:
        logging.error(f'Cannot open LAMA config file: {str(configfile)}\n{e}')
        raise
    except Exception as e:
        raise LamaConfigError(e)

    config.mkdir('output_dir')
    qc_dir = config.mkdir('qc_dir')
    config.mkdir('average_folder')
    config.mkdir('root_reg_dir')

    # TODO: find the histogram batch code
    # if not config['no_qc']:
    #     input_histogram_dir = config.mkdir('input_image_histograms')
    #     make_histograms(config['inputs'], input_histogram_dir)

    logpath = config.config_path.parent / LOG_FILE  # Make the log in the same directory as the config file
    common.init_logging(logpath)

    if not common.test_installation('elastix'):
        raise OSError('Make sure elastix is installed')

    # Catch ctrl-c signals so we can write that to the logs
    # signal.signal(signal.SIGTERM, common.service_shutdown)
    signal.signal(signal.SIGINT, common.service_shutdown)

    mem_monitor = MonitorMemory(Path(config['output_dir']).absolute())

    # Disable QC output?
    no_qc: bool = config['no_qc']

    logging.info(common.git_log())  # If running from a git repo, log the branch and commit

    logging.info("Registration started")

    final_registration_dir = run_registration_schedule(config)

    make_deformations_at_different_scales(config)

    create_glcms(config, final_registration_dir)

    if config['skip_transform_inversion']:
        logging.info('Skipping inversion of transforms')
    else:
        logging.info('inverting transforms')
        batch_invert_transform_parameters(config)

        logging.info('inverting volumes')
        invert_volumes(config)

        if config['label_map']:
            generate_organ_volumes(config)

    if not generate_staging_data(config):
        logging.warning('No staging data generated')

    # Write out the names of the registration dirs in the order they were run
    with open(config['root_reg_dir'] / REG_DIR_ORDER, 'w') as fh:
        for reg_stage in config['registration_stage_params']:
            fh.write(f'{reg_stage["stage_id"]}\n')

    if not no_qc:
        if config['skip_transform_inversion']:
            inverted_label_overlay_dir = None
        else:
            inverted_label_overlay_dir = config.mkdir('inverted_label_overlay_dir')

        # registered_midslice_dir = config.mkdir('registered_midslice_dir')

        make_qc_images(config.config_dir, config['fixed_volume'], qc_dir)

    mem_monitor.stop()

    return True
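
# Example invocation (hypothetical config path; input paths inside the config
# are resolved relative to the config file's directory, per the docstring):
#
#   from pathlib import Path
#
#   run(Path('my_project/registration_config.toml'))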
def run(wt_dir: Path,
        mut_dir: Path,
        out_dir: Path,
        num_perms: int,
        label_info: Path = None,
        label_map_path: Path = None,
        line_fdr: float = 0.05,
        specimen_fdr: float = 0.2,
        normalise_to_whole_embryo: bool = True,
        qc_file: Path = None,
        voxel_size: float = 1.0):
    """
    Run the permutation-based stats pipeline

    Parameters
    ----------
    wt_dir
        Root of the wild type registration output.
        This should contain an 'output' folder that contains a single baseline folder that contains multiple
        specimen folders
    mut_dir
        Root of the mutant registration output.
        This should contain an 'output' folder that contains multiple mutant line folders, each containing one or
        more mutant specimen folders
    out_dir
        Where to store the intermediate results of the permutation testing
    num_perms
        Number of permutations to do
    label_info
        If supplied, use it to annotate the results with label names. Can also be used to filter certain labels
        from the analysis using the 'no_analysis' column
    line_fdr
        The FDR threshold at which to accept line-level calls
    specimen_fdr
        The FDR threshold at which to accept specimen-level calls
    normalise_to_whole_embryo
        Whether to divide each organ volume by the whole embryo volume
    qc_file
        CSV indicating labels from specimens that should be excluded from the analysis.
        Columns:
            id: the specimen id
            line: the line id
            label: the label to exclude (int)
            label_name (optional)
    voxel_size
        For calculating organ volumes
    """
    # Collate all the staging and organ volume data into CSVs
    np.random.seed(999)
    init_logging(out_dir / 'stats.log')
    logging.info(common.git_log())
    logging.info(f'Running {__name__} with following commands\n{common.command_line_agrs()}')

    logging.info('Searching for staging data')
    wt_staging = get_staging_data(wt_dir)
    mut_staging = get_staging_data(mut_dir)

    logging.info('Searching for organ volume data')
    wt_organ_vol = get_organ_volume_data(wt_dir)
    mut_organ_vol = get_organ_volume_data(mut_dir)

    # data
    #   index: spec_id
    #   cols: label_nums, with staging and line columns at the end
    data = prepare_data(wt_organ_vol, wt_staging, mut_organ_vol, mut_staging,
                        label_meta=label_info,
                        normalise_to_whole_embryo=normalise_to_whole_embryo,
                        qc_file=qc_file)

    # Keep a record of the input data used in the analysis
    data.to_csv(out_dir / 'input_data.csv')

    # Keep raw data for plotting
    # raw_wt_vols = wt_organ_vol.copy()  # These include QC'd specimens, which would need removing

    out_dir.mkdir(exist_ok=True, parents=True)  # Root directory for output

    # Make a directory to store distributions and thresholds
    dists_out = out_dir / 'distributions'
    dists_out.mkdir(exist_ok=True)

    # Get the null distributions
    line_null, specimen_null = distributions.null(data, num_perms)

    # with open(dists_out / 'null_ids.yaml', 'w') as fh:
    #     yaml.dump(null_ids, fh)

    null_line_pvals_file = dists_out / 'null_line_dist_pvalues.csv'
    null_specimen_pvals_file = dists_out / 'null_specimen_dist_pvalues.csv'

    # Write the null distributions to file
    line_null.to_csv(null_line_pvals_file)
    specimen_null.to_csv(null_specimen_pvals_file)

    # Get the alternative p-value distributions (and now also the t-values, items 2 and 3)
    line_alt, spec_alt, line_alt_t, spec_alt_t = distributions.alternative(data)

    line_alt_pvals_file = dists_out / 'alt_line_dist_pvalues.csv'
    spec_alt_pvals_file = dists_out / 'alt_specimen_dist_pvalues.csv'

    # Write the alternative distributions to file
    line_alt.to_csv(line_alt_pvals_file)
    spec_alt.to_csv(spec_alt_pvals_file)

    line_organ_thresholds = p_thresholds.get_thresholds(line_null, line_alt)
    specimen_organ_thresholds = p_thresholds.get_thresholds(specimen_null, spec_alt)

    line_thresholds_path = dists_out / 'line_organ_p_thresholds.csv'
    spec_thresholds_path = dists_out / 'specimen_organ_p_thresholds.csv'

    line_organ_thresholds.to_csv(line_thresholds_path)
    specimen_organ_thresholds.to_csv(spec_thresholds_path)

    logging.info('Annotating lines')

    lines_root_dir = out_dir / 'lines'
    lines_root_dir.mkdir(exist_ok=True)

    # Annotate lines
    logging.info(f"Annotating lines, using an FDR threshold of {line_fdr}")
    line_hits = annotate(line_organ_thresholds, line_alt, lines_root_dir,
                         label_info=label_info, label_map=label_map_path,
                         write_thresholded_inv_labels=True, fdr_threshold=line_fdr,
                         t_values=line_alt_t, organ_volumes=data)

    line_hits.to_csv(out_dir / 'line_hits.csv')

    # Annotate specimens
    logging.info(f"Annotating specimens, using an FDR threshold of {specimen_fdr}")
    spec_hits = annotate(specimen_organ_thresholds, spec_alt, lines_root_dir, is_line_level=False,
                         label_info=label_info, label_map=label_map_path, fdr_threshold=specimen_fdr,
                         t_values=spec_alt_t, organ_volumes=data)

    spec_hits.to_csv(out_dir / 'specimen_level_hits.csv')

    # Make plots
    data_for_plots = data.copy()
    data_for_plots.columns = [x.strip('x') for x in data_for_plots.columns]  # Strip any 'x' prefixes

    # If the data has been normalised to WEV, revert it for the plots
    if normalise_to_whole_embryo:
        for col in data_for_plots.columns:
            if col.isdigit():
                data_for_plots[col] = data_for_plots[col] * data_for_plots['staging']

    make_plots(mut_dir, data_for_plots, label_info, lines_root_dir, voxel_size=voxel_size)

    # Get specimen info. Currently just the WEV z-score, to highlight specimens that are too small/large
    spec_info_file = out_dir / 'specimen_info.csv'
    write_specimen_info(wt_staging, mut_staging, spec_info_file)

    dist_plot_root = out_dir / 'distribution_plots'
    line_plot_dir = dist_plot_root / 'line_level'
    line_plot_dir.mkdir(parents=True, exist_ok=True)
    pvalue_dist_plots(line_null, line_alt, line_organ_thresholds, line_plot_dir)

    specimen_plot_dir = dist_plot_root / 'specimen_level'
    specimen_plot_dir.mkdir(parents=True, exist_ok=True)
    pvalue_dist_plots(specimen_null, spec_alt.drop(columns=['line']), specimen_organ_thresholds, specimen_plot_dir)

    heatmaps_for_permutation_stats(lines_root_dir)
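
# A minimal example of the qc_file format described in the docstring above
# (the rows are invented for illustration; each row excludes one label of one
# specimen from the analysis):
#
#   id,line,label,label_name
#   spec_001,mutant_line_1,17,liver
#   spec_004,mutant_line_2,3,heart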
def run(configfile: Path):
    """
    This is the main function of the LAMA script for generating data from registering volumes.
    It reads in the config file, creates directories, and initialises the registration process.
    Paths to inputs are looked for relative to the directory containing the config file.

    Parameters
    ----------
    configfile
        A TOML config file
    """
    try:
        config = LamaConfig(configfile)
    except OSError as e:
        logging.error(f'Cannot open LAMA config file: {str(configfile)}\n{e}')
        raise
    except Exception as e:
        raise LamaConfigError(e)

    config.mkdir('output_dir')
    qc_dir = config.mkdir('qc_dir')
    config.mkdir('average_folder')
    config.mkdir('root_reg_dir')

    # TODO: find the histogram batch code
    # if not config['no_qc']:
    #     input_histogram_dir = config.mkdir('input_image_histograms')
    #     make_histograms(config['inputs'], input_histogram_dir)

    logpath = config.config_path.parent / LOG_FILE  # Make the log in the same directory as the config file
    common.init_logging(logpath)

    if not common.test_installation('elastix'):
        raise OSError('Make sure elastix is installed')

    # Catch ctrl-c signals so we can write that to the logs
    # signal.signal(signal.SIGTERM, common.service_shutdown)
    signal.signal(signal.SIGINT, common.service_shutdown)

    mem_monitor = MonitorMemory(Path(config['output_dir']).absolute())

    # Disable QC output?
    no_qc: bool = config['no_qc']

    logging.info(common.git_log())  # If running from a git repo, log the branch and commit

    logging.info("Registration started")

    first_stage_only = config['skip_forward_registration']
    # If we only want the reverse label propagation, we just need the initial rigid registration to act as the
    # fixed image for the moving population average
    final_registration_dir = run_registration_schedule(config, first_stage_only=first_stage_only)

    if not first_stage_only:
        neg_jac = make_deformations_at_different_scales(config)
        folding_report(neg_jac, config['output_dir'], config['label_info'], outdir=config['output_dir'])

        create_glcms(config, final_registration_dir)

    # Write out the names of the registration dirs in the order they were run
    with open(config['root_reg_dir'] / REG_DIR_ORDER_CFG, 'w') as fh:
        for reg_stage in config['registration_stage_params']:
            fh.write(f'{reg_stage["stage_id"]}\n')
            if first_stage_only:
                break

    if config['skip_transform_inversion']:
        logging.info('Skipping inversion of transforms')
    else:
        logging.info('inverting transforms')

        if config['label_propagation'] == 'reverse_registration':
            reverse_registration(config)
        else:  # The invert_transform method is the default
            batch_invert_transform_parameters(config)

        logging.info('propagating volumes')
        invert_volumes(config)

        # Now that labels have been inverted, should we delete the transform files?
        if config['delete_inverted_transforms']:
            shutil.rmtree(config['output_dir'] / 'inverted_transforms')

        if config['label_map']:
            generate_organ_volumes(config)

        if config['seg_plugin_dir']:
            plugin_interface.secondary_segmentation(config)

    if not generate_staging_data(config):
        logging.warning('No staging data generated')

    if not no_qc:
        rev_reg = config['label_propagation'] == 'reverse_registration'
        make_qc_images(config.config_dir, config['fixed_volume'], qc_dir,
                       mask=None, reverse_reg_propagation=rev_reg)

    mem_monitor.stop()

    return True
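
# A sketch of the config keys this function reads (TOML; the key names are
# taken from the lookups above, but the values and layout are illustrative
# only):
#
#   fixed_volume = 'target/population_average.nrrd'
#   no_qc = false
#   skip_forward_registration = false
#   label_propagation = 'reverse_registration'  # or 'invert_transform' (the default)
#   delete_inverted_transforms = false
#   label_map = 'target/labels.nrrd'
#   seg_plugin_dir = 'plugins/my_seg_plugin'    # optional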