Example 1
def apply_filter(csv, threshold, outdir, alpha=0.05):
    """Apply pydamage filtering

    Args:
        csv (str): path to pydamage result file
        threshold (float): Threshold value. 0 means the threshold is found automatically with the kneed method
        outdir (str): Path to output directory
        alpha (float, optional): Alpha q-value threshold. Defaults to 0.05.
    """

    df = read_csv(csv)
    outfile = "pydamage_filtered_results.csv"
    if threshold == 0:
        threshold = define_threshold(df)
        print(
            f"Optimal prediction accuracy threshold found to be: {threshold}")
    filt_df = filter_pydamage_results(df, acc_thresh=threshold)
    print(
        f"Filtering PyDamage results with qvalue <= {alpha} and predicted_accuracy >= {threshold}"
    )
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    df_to_csv(filt_df, outdir, outfile)
    print(f"Filtered PyDamage results written to {outdir}/{outfile}")
    return filt_df
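A minimal usage sketch for this function, assuming the helpers it calls (read_csv, define_threshold, filter_pydamage_results, df_to_csv) are importable from the same module and that a PyDamage results CSV already exists; the file names and paths below are illustrative only:

# Sketch only: paths are hypothetical, not the library's documented defaults
results_csv = "pydamage_results/pydamage_results.csv"
filtered = apply_filter(results_csv, threshold=0, outdir="pydamage_filtered", alpha=0.05)
# threshold=0 asks apply_filter to pick the accuracy cutoff automatically
print(filtered.shape)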
Example 2
def apply_filter(csv, outdir, alpha=0.05):
    """Apply pydamage filtering

    Args:
        csv (str): path to pydamage result file
        outdir (str): Path to output directory
        alpha (float, optional): Alpha q-value threshold. Defaults to 0.05.
    """

    df = read_csv(csv)
    outfile = "pydamage_filtered_results.csv"
    knee = find_knee(df)
    print(f"Optimal prediction accuracy threshold found to be: {knee}")
    filt_df = filter_pydamage_results(df, acc_thresh=knee)
    print(
        f"Filtering PyDamage results with qvalue <={alpha} and pred_accuracy >= {knee}"
    )
    df_to_csv(filt_df, outdir, outfile)
    print(f"Filtered PyDamage results written to {outdir}/{outfile}")
    return filt_df
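The same calling pattern applies to this earlier variant, which always derives the accuracy cutoff from the knee point; a minimal hedged call, again with an illustrative path:

# Sketch only: path is hypothetical
filtered = apply_filter("pydamage_results/pydamage_results.csv", outdir="pydamage_filtered")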
Example 3
def analyze_multi(
    bam,
    wlen=30,
    show_al=False,
    process=1,
    outdir="",
    plot=False,
    verbose=False,
    force=False,
):
    """Runs the pydamage analysis for each reference separately

    Args:
        bam(str): Path to alignment (sam/bam/cram) file
        wlen(int): window length
        show_al(bool): print alignment representations
        process(int): Number of processes for parallel computing
        outdir(str): Path to output directory
        plot(bool): generate pydamage plots
        verbose(bool): verbose mode
        force(bool): force overwriting of results directory
    Returns:
        pd.DataFrame: pandas DataFrame containing pydamage results

    """
    if verbose:
        print(f"Pydamage version {__version__}\n")
    utils.makedir(outdir, force=force)

    if not verbose:
        warnings.filterwarnings("ignore")

    mode = utils.check_extension(bam)
    alf = pysam.AlignmentFile(bam, mode)

    if not alf.has_index():
        print(f"BAM file {bam} has no index. Sort BAM file and provide index "
              "before running pydamage.")
        sys.exit(1)

    refs = list(alf.references)

    if len(refs) == 0:
        print(f"No aligned sequences in {bam}")
        return []

    proc = min(len(refs), process)

    ##########################
    # Simple loop for debugging
    ##########################
    # filt_res = []
    # for ref in refs:
    #     res = damage.test_damage(bam=bam, ref=ref, wlen=wlen,
    #                              min_al=mini,  min_cov=cov, show_al=show_al,
    #                              mode=mode, process=process, verbose=verbose)
    #     if res:
    #         filt_res.append(res)
    ##########################
    ##########################

    test_damage_partial = partial(
        damage.test_damage,
        bam=bam,
        wlen=wlen,
        show_al=show_al,
        mode=mode,
        process=process,
        verbose=verbose,
    )
    print("Estimating and testing Damage")
    with multiprocessing.Pool(proc) as p:
        res = list(tqdm(p.imap(test_damage_partial, refs), total=len(refs)))
    filt_res = [i for i in res if i]

    print(f"{len(filt_res)} contigs were successfully analyzed by Pydamage")

    if plot and len(filt_res) > 0:
        print("\nGenerating Pydamage plots")
        plotdir = f"{outdir}/plots"
        utils.makedir(plotdir, confirm=False)

        plot_partial = partial(damageplot, outdir=plotdir)
        with multiprocessing.Pool(proc) as p:
            list(tqdm(p.imap(plot_partial, filt_res), total=len(filt_res)))

    df_pydamage = utils.pandas_processing(res_dict=filt_res)

    acc_model = load_model()
    prep_df_glm = prepare_data(df_pydamage)
    df_glm = fit_model(prep_df_glm, acc_model)

    df = df_glm.merge(df_pydamage, left_index=True, right_index=True)

    utils.df_to_csv(df, outdir)
    return df
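A hedged example of how this function might be invoked, assuming a coordinate-sorted and indexed BAM file; all paths and parameter values are illustrative, not defaults documented by the library:

# Sketch only: requires a sorted, indexed alignment file
df = analyze_multi(
    bam="aligned.sorted.bam",   # illustrative path
    wlen=30,
    process=4,
    outdir="pydamage_results",
    plot=True,
    verbose=True,
    force=True,                 # overwrite an existing output directory
)
print(df.head())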
Example 4
def analyze_group(
    bam,
    wlen=30,
    show_al=False,
    process=1,
    outdir="",
    plot=False,
    verbose=False,
    force=False,
):
    """Runs the pydamage analysis with all references grouped as one

    Args:
        bam(str): Path to alignment (sam/bam/cram) file
        wlen(int): window length
        show_al(bool): print alignment representations
        process(int): Number of processes for parallel computing
        outdir(str): Path to output directory
        plot(bool): generate pydamage plots
        verbose(bool): verbose mode
        force(bool): force overwriting of results directory
    Returns:
        pd.DataFrame: pandas DataFrame containing pydamage results
    """

    if verbose:
        print(f"Pydamage version {__version__}\n")
    utils.makedir(outdir, force=force)

    if not verbose:
        warnings.filterwarnings("ignore")

    mode = utils.check_extension(bam)
    alf = pysam.AlignmentFile(bam, mode)

    if not alf.has_index():
        print(f"BAM file {bam} has no index. Sort BAM file and provide index "
              "before running pydamage.")
        sys.exit(1)

    refs = list(alf.references)

    if len(refs) == 0:
        print(f"No aligned sequences in {bam}")
        return []

    proc = min(len(refs), process)

    get_damage_group_partial = partial(
        damage.get_damage_group,
        bam=bam,
        wlen=wlen,
        show_al=show_al,
        mode=mode,
        process=process,
    )
    print("Estimating and testing Damage")
    with multiprocessing.Pool(proc) as p:
        res = list(
            tqdm(p.imap(get_damage_group_partial, refs), total=len(refs)))
    ct_data = []
    ga_data = []
    cc_data = []
    all_bases = []
    cov = 0
    nb_ref = 0
    nb_reads_aligned = 0
    reflen = 0
    for i in res:
        ct_data += i[0]
        ga_data += i[1]
        cc_data += i[2]
        all_bases += i[5]
        cov += i[6]
        nb_ref += 1
        nb_reads_aligned += i[7]
        reflen += i[8]
    cov = cov / nb_ref

    damage_dict = damage.test_damage_group(
        ct_data,
        ga_data,
        cc_data,
        all_bases,
        nb_reads_aligned,
        cov,
        reflen,
        wlen,
        verbose,
    )

    if plot:
        print("\nGenerating Pydamage plot")
        plotdir = f"{outdir}/plots"
        utils.makedir(plotdir, confirm=False)
        damageplot(damage_dict, outdir=plotdir)

    df_pydamage = utils.pandas_group_processing(res_dict=damage_dict)

    acc_model = load_model()
    prep_df_glm = prepare_data(df_pydamage)
    df_glm = fit_model(prep_df_glm, acc_model)

    df = df_glm.merge(df_pydamage, left_index=True, right_index=True)

    utils.df_to_csv(df, outdir)
    return df
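A usage sketch for the grouped variant, which pools all references into a single damage estimate; as above, the paths and parameter values are assumptions for illustration:

# Sketch only: one pooled estimate across all references
df = analyze_group(
    bam="aligned.sorted.bam",   # illustrative path
    wlen=30,
    process=4,
    outdir="pydamage_results_grouped",
    plot=True,
    force=True,
)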
Example 5
def pydamage_analyze(
    bam,
    wlen=30,
    show_al=False,
    process=1,
    outdir="",
    plot=False,
    verbose=False,
    force=False,
    group=False,
):
    """Runs the pydamage analysis for each reference separately

    Args:
        bam(str): Path to alignment (sam/bam/cram) file
        wlen(int): window length
        show_al(bool): print alignment representations
        process(int): Number of processes for parallel computing
        outdir(str): Path to output directory
        plot(bool): generate pydamage plots
        verbose(bool): verbose mode
        force(bool): force overwriting of results directory
        group(bool): pool all references together instead of analyzing each separately
    Returns:
        pd.DataFrame: pandas DataFrame containing pydamage results

    """
    if verbose:
        print(f"Pydamage version {__version__}\n")
    utils.makedir(outdir, force=force)

    refs, mode = utils.prepare_bam(bam)

    proc = min(len(refs), process)

    ##########################
    # Simple loop for debugging
    ##########################
    # filt_res = []
    # for ref in refs:
    #     res = damage.test_damage(
    #         bam=bam,
    #         ref=ref,
    #         wlen=wlen,
    #         show_al=show_al,
    #         mode=mode,
    #         process=process,
    #         verbose=verbose,
    #     )
    #     if res:
    #         filt_res.append(res)
    #     break
    ##########################
    ##########################

    test_damage_partial = partial(
        damage.test_damage,
        bam=bam,
        mode=mode,
        wlen=wlen,
        show_al=show_al,
        process=process,
        verbose=verbose,
    )
    print("Estimating and testing Damage")
    if group:
        filt_res = [
            damage.test_damage(
                ref=None,
                bam=bam,
                mode=mode,
                wlen=wlen,
                show_al=show_al,
                process=process,
                verbose=verbose,
            )
        ]
    else:
        with multiprocessing.Pool(proc) as p:
            res = list(tqdm(p.imap(test_damage_partial, refs),
                            total=len(refs)))
        filt_res = [i for i in res if i]

    print(f"{len(filt_res)} contig(s) successfully analyzed by Pydamage")
    if len(filt_res) == 0:
        warnings.warn("No alignments were found, check your alignment file",
                      PyDamageWarning)

    if plot and len(filt_res) > 0:
        print("\nGenerating Pydamage plots")
        plotdir = f"{outdir}/plots"
        utils.makedir(plotdir, confirm=False)

        plot_partial = partial(damageplot, outdir=plotdir, wlen=wlen)
        with multiprocessing.Pool(proc) as p:
            list(tqdm(p.imap(plot_partial, filt_res), total=len(filt_res)))
    df_pydamage = utils.pandas_processing(res_dict=filt_res, wlen=wlen)

    acc_model = load_model()
    prep_df_glm = prepare_data(df_pydamage)
    df_glm = fit_model(prep_df_glm, acc_model)

    df = df_glm.merge(df_pydamage, left_index=True, right_index=True)

    utils.df_to_csv(df, outdir)
    return df
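A sketch combining this entry point with the filtering helper from Example 1; the chained call, output locations, and parameter values are assumptions for illustration, not the library's documented workflow:

# Sketch only: per-reference analysis followed by filtering (paths hypothetical)
df = pydamage_analyze(
    bam="aligned.sorted.bam",
    process=4,
    outdir="pydamage_results",
    plot=False,
    force=True,
    group=False,   # set True to pool all references into one estimate
)
filtered = apply_filter(
    csv="pydamage_results/pydamage_results.csv",  # assumed output location
    threshold=0,                                  # pick the accuracy cutoff automatically
    outdir="pydamage_results/filtered",
)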