コード例 #1
0
def cross_validation_inmode(inmode_model, peaks_path, counter, length,
                            path_to_inmode, path_to_java, tmp_dir, output_dir,
                            pfpr):
    """Evaluate an InMoDe model on peaks against a generated background.

    Peaks are read from ``peaks_path`` and a background set is built from
    them with ``creat_background(peaks, length, counter)``.  Both sets are
    handed to ``write_fasta`` (tags "train" and "shuffled") inside
    ``tmp_dir`` and then scored through ``true_scores_inmode`` and
    ``false_scores_inmode``.  The ROC and the partial AUC (computed with
    ``pfpr``) are written to ``output_dir`` as ``inmode_cv.txt`` and
    ``inmode_auc.txt``.

    ``tmp_dir`` is created when it does not exist and is removed before
    returning.  Always returns 0.
    """
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)

    peaks = read_peaks(peaks_path)
    background_peaks = creat_background(peaks, length, counter)
    write_fasta(background_peaks, tmp_dir, "shuffled")
    write_fasta(peaks, tmp_dir, "train")

    # Scores for the real peaks first, then for the background set.
    true_scores = list(
        true_scores_inmode(inmode_model, path_to_inmode, path_to_java,
                           tmp_dir, "train", 'current'))
    false_scores = list(
        false_scores_inmode(inmode_model, path_to_inmode, path_to_java,
                            tmp_dir, "shuffled", 'current'))

    fprs = calculate_fprs(true_scores, false_scores)
    roc = calculate_short_roc(fprs, step=1)
    auc = calculate_particial_auc(roc['TPR'], roc['FPR'], pfpr)

    auc_file = output_dir + '/inmode_auc.txt'
    roc_file = output_dir + "/inmode_cv.txt"
    write_auc(auc_file, auc)
    write_roc(roc_file, roc)

    shutil.rmtree(tmp_dir)
    return 0
コード例 #2
0
def de_novo_with_oprimization_strum(peaks_path, backgroud_path, tmp_dir,
                                    output_dir, output_auc, cpu_count, pfpr):
    """Pick the best StruM motif length, then train and export the final model.

    ``learn_optimized_strum`` is run first so that ``output_auc`` contains the
    per-length results; ``choose_best_model(output_auc)`` selects the length.
    The bootstrap ROC files for that length are copied into ``output_dir``,
    a StruM model is trained on all peaks, and the model is written both as
    a MEME file (built from the sites that contain no 'N') and as a pickle.

    Returns the selected motif length.
    """
    counter = 5000000

    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)
    for directory in (output_auc, output_dir):
        if not os.path.isdir(directory):
            os.mkdir(directory)

    learn_optimized_strum(peaks_path, backgroud_path, counter, tmp_dir,
                          output_auc, cpu_count, pfpr)
    length = choose_best_model(output_auc)

    # Keep the ROC curves of the winning length next to the final model.
    bootstrap_src = output_auc + '/training_bootstrap_{}.txt'.format(length)
    merged_src = output_auc + '/training_bootstrap_merged_{}.txt'.format(length)
    copyfile(bootstrap_src, output_dir + '/bootstrap.txt')
    copyfile(merged_src, output_dir + '/bootstrap_merged.txt')

    strum_model = strum_de_novo(peaks_path, length, cpu_count)
    peaks = read_peaks(peaks_path)
    _, found_sites = true_scores_strum(peaks, strum_model, length)

    # Sites containing 'N' are excluded from the count matrix.
    sites = [site for site in found_sites if 'N' not in site]
    pfm = make_pfm(make_pcm(sites))
    nsites = len(sites)
    background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

    tag = 'strum_model'
    write_meme(output_dir, tag, pfm, background, nsites)
    write_strum(strum_model, output_dir + '/{}.pickle'.format(tag))
    return length
コード例 #3
0
def learn_optimized_bamm_support(peaks_path, backgroud_path, counter, order, length, pwm_auc_dir, tmp_dir, output_dir, pfpr):
    """Two-fold evaluation of a BaMM model for one (length, order) setting.

    The peaks are split into odd- and even-numbered halves (1-based).  For
    each step one half is used for training and the other for testing.  The
    background is read from ``backgroud_path`` when that file exists,
    otherwise it is generated from the test peaks and written to
    ``tmp_dir``.  Scores from both steps are pooled, and the ROC, merged ROC
    and partial AUC are written to ``output_dir`` together with copies of
    the model files produced in ``tmp_dir``.

    Always returns 0.
    """
    true_scores = []
    false_scores = []
    peaks = read_peaks(peaks_path)
    train_fasta = tmp_dir + '/train.fasta'

    for step in ('odd', 'even'):
        meme = pwm_auc_dir + '/pwm_model_{0}_{1}.meme'.format(step, length)

        # Position 0 is peak number 1, so even positions are odd-numbered peaks.
        odd_numbered = [p for pos, p in enumerate(peaks) if pos % 2 == 0]
        even_numbered = [p for pos, p in enumerate(peaks) if pos % 2 == 1]
        if step == 'odd':
            train_peaks, test_peaks = odd_numbered, even_numbered
        else:
            train_peaks, test_peaks = even_numbered, odd_numbered

        write_fasta(train_peaks, train_fasta)

        if os.path.isfile(backgroud_path):
            shuffled_peaks = read_peaks(backgroud_path)
            background_fasta = backgroud_path
        else:
            shuffled_peaks = creat_background(test_peaks, length, counter)
            background_fasta = tmp_dir + '/background.fasta'
            write_fasta(shuffled_peaks, background_fasta)

        # create_bamm_model returns the order alongside the model; the
        # returned value replaces the incoming ``order`` from here on.
        bamm, order = create_bamm_model(train_fasta, background_fasta, tmp_dir, order, meme, 0, length)

        true_scores.extend(true_scores_bamm(test_peaks, bamm, order, length))
        false_scores.extend(false_scores_bamm(shuffled_peaks, bamm, order, length))

        model_src = tmp_dir + '/{}_motif_1.ihbcp'.format(length)
        model_dst = output_dir + '/bamm_model_{0}_{1}_{2}.ihbcp'.format(step, order, length)
        shutil.copy(model_src, model_dst)

        bg_src = tmp_dir + '/{}.hbcp'.format(length)
        bg_dst = output_dir + '/bamm_{0}_{1}_{2}.hbcp'.format(step, order, length)
        shutil.copy(bg_src, bg_dst)

    fprs = calculate_fprs(true_scores, false_scores)
    roc = calculate_short_roc(fprs, step=1)
    merged_roc = calculate_merged_roc(fprs)
    auc = calculate_particial_auc(merged_roc['TPR'], merged_roc['FPR'], pfpr)
    print("Length {0}; Order {1}".format(length, order), "pAUC at {0} = {1};".format(pfpr, auc))

    write_auc_with_order(output_dir + '/auc.txt', auc, length, order)
    write_roc(output_dir + "/training_bootstrap_{0}_{1}.txt".format(length, order), roc)
    write_roc(output_dir + "/training_bootstrap_merged_{0}_{1}.txt".format(length, order), merged_roc)
    return 0
コード例 #4
0
def cross_validation_pwm(pwm, length, peaks_path, counter, output_dir, pfpr):
    """Evaluate a PWM on peaks against a generated background.

    Peaks are read from ``peaks_path`` and a background set is built from
    them with ``creat_background``.  Both sets are scored with the PWM, and
    the ROC and partial AUC (computed with ``pfpr``) are written to
    ``output_dir`` as ``pwm_cv.txt`` and ``pwm_auc.txt``.

    Always returns 0.
    """
    peaks = read_peaks(peaks_path)
    background_peaks = creat_background(peaks, length, counter)

    true_scores = list(true_scores_pwm(peaks, pwm, length))
    false_scores = list(false_scores_pwm(background_peaks, pwm, length))

    fprs = calculate_fprs(true_scores, false_scores)
    roc = calculate_short_roc(fprs, step=1)
    auc = calculate_particial_auc(roc['TPR'], roc['FPR'], pfpr)

    auc_file = output_dir + '/pwm_auc.txt'
    roc_file = output_dir + "/pwm_cv.txt"
    write_auc(auc_file, auc)
    write_roc(roc_file, roc)
    return 0
コード例 #5
0
def de_novo_with_oprimization_bamm(peaks_path, backgroud_path, pwm_auc_dir, tmp_dir,
    output_dir, output_auc, pfpr):
    """Pick the best BaMM (length, order), then train and export the final model.

    ``learn_optimized_bamm`` fills ``output_auc``; ``choose_best_model`` then
    selects the length and order.  A seed motif is produced with STREME on
    all peaks, written as ``pfm_model.meme`` in ``tmp_dir`` and passed to
    ``create_bamm_model``.  The background is ``backgroud_path`` when that
    file exists; otherwise one is generated from the peaks and written to
    ``tmp_dir``.  The model files and the bootstrap ROC files for the
    selected setting are copied into ``output_dir``.

    ``tmp_dir`` is emptied at the start and removed at the end.

    Returns:
        tuple: ``(length, order)`` of the selected model.
    """
    # Always start from an empty scratch directory.
    if os.path.exists(tmp_dir):
        shutil.rmtree(tmp_dir)
    os.mkdir(tmp_dir)
    for directory in (output_dir, output_auc):
        if not os.path.isdir(directory):
            os.mkdir(directory)

    counter = 1000000
    learn_optimized_bamm(peaks_path, backgroud_path, counter, pwm_auc_dir, tmp_dir, output_auc, pfpr)
    length, order = choose_best_model(output_auc)

    # Only the background source and the STREME entry point differ between
    # the two cases; everything after that is shared.
    if os.path.isfile(backgroud_path):
        background_fasta = backgroud_path
        run_streme(peaks_path, background_fasta, tmp_dir, length)
    else:
        background_fasta = tmp_dir + '/background.fasta'
        peaks = read_peaks(peaks_path)
        shuffled_peaks = creat_background(peaks, length, counter)
        write_fasta(shuffled_peaks, background_fasta)
        run_streme_hmm_background(peaks_path, tmp_dir, length)

    # The motif length reported by STREME is not used here; the length
    # chosen by choose_best_model is the one passed on.
    pfm, background_pfm, _, nsites = parse_streme(tmp_dir + '/streme.txt')
    tag = 'pfm_model'
    write_meme(tmp_dir, tag, pfm, background_pfm, nsites)
    meme = tmp_dir + '/pfm_model.meme'
    create_bamm_model(peaks_path, background_fasta, tmp_dir, order, meme, 0, length)

    shutil.copy(tmp_dir + '/pfm_model.meme',
                output_dir + '/pfm_model.meme')
    shutil.copy(tmp_dir + '/{}_motif_1.ihbcp'.format(length),
                output_dir + '/bamm_model.ihbcp')
    shutil.copy(tmp_dir + '/{}.hbcp'.format(length),
                output_dir + '/bamm.hbcp')
    shutil.copy(output_auc + '/training_bootstrap_{0}_{1}.txt'.format(length, order),
                output_dir + '/bootstrap.txt')
    shutil.copy(output_auc + '/training_bootstrap_merged_{0}_{1}.txt'.format(length, order),
                output_dir + '/bootstrap_merged.txt')
    shutil.rmtree(tmp_dir)
    return (length, order)
コード例 #6
0
def learn_optimized_pwm(peaks_path, backgroud_path, counter, tmp_dir,
                        output_auc, pfpr):
    """Two-fold evaluation of STREME-derived PWMs over a range of lengths.

    For every length in ``range(8, 31, 4)`` the peaks are split into odd-
    and even-numbered halves (1-based).  In each step a motif is learned
    with STREME on one half and scored on the other half and on a
    background (read from ``backgroud_path`` when that file exists,
    otherwise generated from the test peaks).  The per-step models and the
    per-length ROC / partial AUC are written to ``output_auc``.

    ``output_auc/auc.txt`` is deleted at the start if present, and
    ``tmp_dir`` is removed at the end.  Always returns 0.
    """
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)
    if not os.path.isdir(output_auc):
        os.mkdir(output_auc)

    auc_file = output_auc + '/auc.txt'
    if os.path.exists(auc_file):
        os.remove(auc_file)

    train_fasta = tmp_dir + '/train.fasta'

    for length in range(8, 31, 4):
        true_scores = []
        false_scores = []
        peaks = read_peaks(peaks_path)

        for step in ('odd', 'even'):
            # Position 0 is peak number 1, so even positions are
            # odd-numbered peaks.
            odd_numbered = [p for pos, p in enumerate(peaks) if pos % 2 == 0]
            even_numbered = [p for pos, p in enumerate(peaks) if pos % 2 == 1]
            if step == 'odd':
                train_peaks, test_peaks = odd_numbered, even_numbered
            else:
                train_peaks, test_peaks = even_numbered, odd_numbered

            write_fasta(train_peaks, train_fasta)

            if os.path.isfile(backgroud_path):
                shuffled_peaks = read_peaks(backgroud_path)
                run_streme(train_fasta, backgroud_path, tmp_dir, length)
            else:
                shuffled_peaks = creat_background(test_peaks, length, counter)
                run_streme_hmm_background(train_fasta, tmp_dir, length)

            # NOTE(review): ``length`` is rebound to the value reported by
            # parse_streme, so scoring, file names and the following step
            # use that value rather than the loop value. Confirm that this
            # is intended.
            pfm, background, length, nsites = parse_streme(tmp_dir +
                                                           '/streme.txt')
            pwm = make_pwm(pfm)

            true_scores.extend(true_scores_pwm(test_peaks, pwm, length))
            false_scores.extend(false_scores_pwm(shuffled_peaks, pwm, length))

            tag = 'pwm_model_{0}_{1}'.format(step, length)
            write_meme(output_auc, tag, pfm, background, nsites)
            write_pwm(output_auc, tag, pwm)
            write_pfm(output_auc, tag, pfm)

        fprs = calculate_fprs(true_scores, false_scores)
        roc = calculate_short_roc(fprs, step=1)
        merged_roc = calculate_merged_roc(fprs)
        auc = calculate_particial_auc(merged_roc['TPR'], merged_roc['FPR'],
                                      pfpr)
        print("Length {};".format(length),
              "pAUC at {0} = {1};".format(pfpr, auc))

        write_auc(auc_file, auc, length)
        merged_roc_file = (
            output_auc + "/training_bootstrap_merged_{0}.txt".format(length))
        roc_file = output_auc + "/training_bootstrap_{0}.txt".format(length)
        write_roc(merged_roc_file, merged_roc)
        write_roc(roc_file, roc)

    shutil.rmtree(tmp_dir)
    return 0
コード例 #7
0
def learn_optimized_strum(peaks_path, backgroud_path, counter, tmp_dir,
                          output_auc, cpu_count, pfpr):
    """Two-fold evaluation of de novo StruM models over a range of lengths.

    For every length in ``range(8, 31, 4)`` the peaks are split into odd-
    and even-numbered halves (1-based).  In the 'odd' step the model is
    trained on the odd-numbered peaks and tested on the even-numbered ones;
    the 'even' step swaps the roles.  Scores and sites from both steps are
    pooled, and the ROC, merged ROC and partial AUC are written to
    ``output_auc`` together with a MEME file built from the pooled sites
    that contain no 'N'.  The pickled model for each length is the one
    trained in the last ('even') step.

    The background is read from ``backgroud_path`` when that file exists,
    otherwise it is generated from the test peaks.

    ``output_auc/auc.txt`` is deleted at the start if present, and
    ``tmp_dir`` is removed at the end.  Always returns 0.
    """
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)
    if not os.path.isdir(output_auc):
        os.mkdir(output_auc)
    if os.path.exists(output_auc + '/auc.txt'):
        os.remove(output_auc + '/auc.txt')
    peaks = read_peaks(peaks_path)
    for length in range(8, 31, 4):
        true_scores = []
        false_scores = []
        sites = []
        for step in ['odd', 'even']:
            # The split must follow ``step``. Previously it was overwritten
            # with the 'odd' split on every pass, so both steps trained and
            # tested on the same halves.
            if step == 'odd':
                train_peaks = [
                    p for index, p in enumerate(peaks, 1) if index % 2 != 0
                ]
                test_peaks = [
                    p for index, p in enumerate(peaks, 1) if index % 2 == 0
                ]
            else:
                train_peaks = [
                    p for index, p in enumerate(peaks, 1) if index % 2 == 0
                ]
                test_peaks = [
                    p for index, p in enumerate(peaks, 1) if index % 2 != 0
                ]
            write_fasta(train_peaks, tmp_dir + '/train.fasta')
            if os.path.isfile(backgroud_path):
                shuffled_peaks = read_peaks(backgroud_path)
            else:
                shuffled_peaks = creat_background(test_peaks, length, counter)
            strum_model = strum_de_novo(tmp_dir + '/train.fasta', length,
                                        cpu_count)
            # true_scores_strum returns (scores, sites); walk them in pairs.
            for true_score, site in zip(
                    *true_scores_strum(test_peaks, strum_model, length)):
                true_scores.append(true_score)
                sites.append(site)
            for false_score in false_scores_strum(shuffled_peaks, strum_model):
                false_scores.append(false_score)
        fprs = calculate_fprs(true_scores, false_scores)
        roc = calculate_short_roc(fprs, step=1)
        merged_roc = calculate_merged_roc(fprs)
        auc = calculate_particial_auc(merged_roc['TPR'], merged_roc['FPR'],
                                      pfpr)
        print("Length {};".format(length),
              "pAUC at {0} = {1};".format(pfpr, auc))
        write_auc(output_auc + '/auc.txt', auc, length)
        write_roc(
            output_auc + "/training_bootstrap_merged_{0}.txt".format(length),
            merged_roc)
        write_roc(output_auc + "/training_bootstrap_{0}.txt".format(length),
                  roc)
        tag = 'strum_model_{0}'.format(length)
        # Sites containing 'N' are excluded from the count matrix.
        sites = [i for i in sites if 'N' not in i]
        pcm = make_pcm(sites)
        pfm = make_pfm(pcm)
        nsites = len(sites)
        background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
        write_strum(strum_model, output_auc + '/{}.pickle'.format(tag))
        write_meme(output_auc, tag, pfm, background, nsites)
    shutil.rmtree(tmp_dir)
    return (0)
コード例 #8
0
def learn_optimized_pwm(peaks_path, backgroud_path, counter, path_to_java,
                        path_to_chipmunk, tmp_r, output_auc, cpu_count, pfpr):
    """Two-fold evaluation of ChIPMunk-derived PWMs over a range of lengths.

    For every length in ``range(8, 31, 4)`` the peaks are split into odd-
    and even-numbered halves (1-based).  In each step ChIPMunk is run on
    one half, the de-duplicated sites it reports are turned into a PWM, and
    that PWM is scored on the other half and on a background (read from
    ``backgroud_path`` when that file exists, otherwise generated from the
    test peaks).  The per-step models and sites and the per-length ROC /
    partial AUC are written to ``output_auc``.

    ``output_auc/auc.txt`` is deleted at the start if present, and ``tmp_r``
    is removed at the end.  Always returns 0.
    """
    if not os.path.exists(tmp_r):
        os.mkdir(tmp_r)
    if not os.path.isdir(output_auc):
        os.mkdir(output_auc)

    auc_file = output_auc + '/auc.txt'
    if os.path.exists(auc_file):
        os.remove(auc_file)

    train_fasta = tmp_r + '/train.fasta'
    chipmunk_output = tmp_r + '/chipmunk_results.txt'

    for length in range(8, 31, 4):
        true_scores = []
        false_scores = []
        peaks = read_peaks(peaks_path)

        for step in ('odd', 'even'):
            # Position 0 is peak number 1, so even positions are
            # odd-numbered peaks.
            odd_numbered = [p for pos, p in enumerate(peaks) if pos % 2 == 0]
            even_numbered = [p for pos, p in enumerate(peaks) if pos % 2 == 1]
            if step == 'odd':
                train_peaks, test_peaks = odd_numbered, even_numbered
            else:
                train_peaks, test_peaks = even_numbered, odd_numbered

            write_fasta(train_peaks, train_fasta)

            if os.path.isfile(backgroud_path):
                shuffled_peaks = read_peaks(backgroud_path)
            else:
                shuffled_peaks = creat_background(test_peaks, length, counter)

            # ``length`` is passed twice, matching the original call.
            run_chipmunk(path_to_java, path_to_chipmunk, train_fasta,
                         chipmunk_output, length, length, cpu_count)
            sites = list(set(parse_chipmunk(chipmunk_output)))

            scoring_pwm = sites_to_pwm(sites)
            true_scores.extend(true_scores_pwm(test_peaks, scoring_pwm, length))
            false_scores.extend(
                false_scores_pwm(shuffled_peaks, scoring_pwm, length))

            # The matrices written to disk are rebuilt from the same sites.
            pfm = make_pfm(make_pcm(sites))
            pwm = make_pwm(pfm)
            nsites = len(sites)
            background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
            tag = 'pwm_model_{0}_{1}'.format(step, length)
            write_meme(output_auc, tag, pfm, background, nsites)
            write_pwm(output_auc, tag, pwm)
            write_pfm(output_auc, tag, pfm)
            write_sites(output=output_auc, tag=tag, sites=sites)

        fprs = calculate_fprs(true_scores, false_scores)
        roc = calculate_short_roc(fprs, step=1)
        merged_roc = calculate_merged_roc(fprs)
        auc = calculate_particial_auc(merged_roc['TPR'], merged_roc['FPR'],
                                      pfpr)
        print("Length {};".format(length),
              "pAUC at {0} = {1};".format(pfpr, auc))

        write_auc(auc_file, auc, length)
        merged_roc_file = (
            output_auc + "/training_bootstrap_merged_{0}.txt".format(length))
        roc_file = output_auc + "/training_bootstrap_{0}.txt".format(length)
        write_roc(merged_roc_file, merged_roc)
        write_roc(roc_file, roc)

    shutil.rmtree(tmp_r)
    return 0