Example #1
from pathlib import Path

import joblib
import pandas as pd

# Project-local dependencies assumed to be importable in the original module:
# `utils` (helper functions) and `InputSpec` (experiment input specification).


def isotope_label_detector(
    input_spec: InputSpec,
    source_dir: Path,
    exp_name: str,
    min_scans: int = 5,
    num_cond: int = 1,
    n_jobs: int = -1,
):
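    """
    Detect isotope labels for each experimental condition.

    For each condition, per-scan ion data are combined, replicate statistics
    are computed (utils.calc_rep_stats) and aggregated into per-condition
    slope CSVs, and label analysis results are written to all_isotope_analysis.
    Results are then summarized across conditions and filtered by num_cond.
    """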
    scan_dir = source_dir.joinpath("all_scan_data")
    slope_dir = source_dir.joinpath("all_slope_data")
    slope_dir.mkdir(parents=True, exist_ok=True)
    out_dir = source_dir.joinpath("all_isotope_analysis")
    out_dir.mkdir(parents=True, exist_ok=True)
    features = utils.get_featurelist(source_dir=source_dir, exp_name=exp_name)
    conditions = input_spec.get_conditions()

    def run_label_detector(cond):
        out_file = slope_dir.joinpath(f"all_slope_data_{cond}.csv")
        print(f"Detecting labels in {cond}")

        # Load both unlabeled and labeled scan data for this condition
        scan_files = list(scan_dir.glob(f"all_ions_*{cond}.csv"))
        scan_dfs = [pd.read_csv(s) for s in scan_files]

        df = utils.combine_dfs(scan_dfs)

        grouped = df.groupby(["exp_id", "isotope", "condition"])
        data = []
        for (e_id, iso, c), g in grouped:
            res = utils.calc_rep_stats(g, e_id, iso, c, min_scans=min_scans)
            if not res:
                continue
            data.extend(res)
        if data:
            agg_df = pd.DataFrame(data)
            res_df = utils.aggregate_results(agg_df)
            res_df.to_csv(out_file, index=False)
            utils.run_label_analysis(res_df, cond, out_dir)
        else:
            print(f"No labels to detect for {cond}")

    # Run processing of each condition in a separate process
    joblib.Parallel(n_jobs=n_jobs)(joblib.delayed(run_label_detector)(c)
                                   for c in conditions)
    # Serial fallback, useful for debugging:
    # [run_label_detector(c) for c in conditions]
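    # Summarize label-detection results across all conditions and write a
    # filtered copy of the summary (filtering controlled by num_cond).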
    sum_df = utils.summarize_labels(out_dir, features, conditions)
    sum_df.to_csv(source_dir.joinpath(f"{exp_name}_data_summary.csv"))
    filtered_df = utils.filter_summary(sum_df, num_cond)
    filtered_df.to_csv(
        source_dir.joinpath(f"{exp_name}_data_summary_filtered.csv"))
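
For reference, a minimal, self-contained sketch of the per-condition
parallelism pattern used above (the worker function and condition names here
are placeholders, not part of the pipeline):

import joblib

def process_condition(cond: str) -> str:
    # Stand-in for run_label_detector: do the per-condition work, return a result.
    return f"processed {cond}"

conditions = ["control", "treated"]
results = joblib.Parallel(n_jobs=2)(
    joblib.delayed(process_condition)(c) for c in conditions)
print(results)  # ['processed control', 'processed treated']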
Example #2
from pathlib import Path
from typing import Dict

import joblib

# Project-local dependencies assumed to be importable in the original module:
# `utils`, `dereplicator`, and `InputSpec`.


def generate_featurelist(
    input_spec: InputSpec,
    source_dir: Path,
    exp_name: str,
    config: Dict,
    n_jobs: int = -1,
    blank_remove: bool = True,
):
    """
    Create an mz ground truth list of all features detected in experiment and basketed
    to align between conditions and replicates. Optional blank subtraction.
    """
    print("Collecting feature list files")

    # Defined here so input_spec and config are in scope and not needed as parameters
    def do_munge_featurelist(cond: str):
        out_dir = source_dir.joinpath(cond)
        out_dir.mkdir(parents=True, exist_ok=True)
        print(f"Working on {cond}")
        all_collapsed = utils.munge_featurelist(inp_spec=input_spec,
                                                cond=cond,
                                                out_dir=out_dir,
                                                config=config)
        return all_collapsed

    # Peaks in blanks are collected here to subtract later.
    # This should not break when there are no blanks,
    # but the printed messages may be misleading.
    if input_spec.get_feature_filepaths("blank"):
        blanks = do_munge_featurelist("blank")
    else:
        print("No blanks found")
        blanks = None
        blank_remove = False

    # Run pre-processing on conditions in separate processes
    conditions = input_spec.get_conditions()
    cond_dfs = joblib.Parallel(n_jobs=n_jobs)(
        joblib.delayed(do_munge_featurelist)(c) for c in conditions)

    all_cond_df = utils.combine_dfs(cond_dfs)
    if blank_remove:
        print("Substracting blanks")
        utils.blank_subtract(blanks, all_cond_df, config=config)
    else:
        print("Not subtracting blanks")
    print("Grouping all features")
    all_cond_df.reset_index(inplace=True, drop=True)
    dereplicator.group_features(
        all_cond_df, exp_name, "exp_id",
        config=config)  # final grouping - exp_id used for scan munging
    all_cond_df.to_csv(source_dir.joinpath(f"{exp_name}_all_features.csv"),
                       index=False)
    return all_cond_df
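
A hedged sketch of how these two steps might be driven together; the InputSpec
loader, config contents, and paths below are assumptions, and the intermediate
step that populates all_scan_data (scan extraction) is not shown in these
examples:

# input_spec = ...                    # experiment specification, loaded elsewhere
# config = {...}                      # parameters consumed by the utils/dereplicator helpers
# source_dir = Path("results/exp01")  # hypothetical output directory
#
# generate_featurelist(input_spec, source_dir, "exp01", config)
# ... scan extraction into source_dir / "all_scan_data" ...
# isotope_label_detector(input_spec, source_dir, "exp01")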