def cross_validation_inmode(inmode_model, peaks_path, counter, length,
                            path_to_inmode, path_to_java, tmp_dir, output_dir,
                            pfpr):
    """Evaluate an existing InMoDe model on peaks versus a generated background.

    Peaks are loaded with ``read_peaks`` and a background set is derived from
    them with ``creat_background``. Both sets are passed to ``write_fasta``
    under the tags "train" and "shuffled", then scored through
    ``true_scores_inmode`` / ``false_scores_inmode``. The resulting ROC and
    its partial AUC at ``pfpr`` are written to ``output_dir`` as
    ``inmode_cv.txt`` and ``inmode_auc.txt``.

    ``tmp_dir`` is created when missing and removed at the end of a
    successful run. Always returns 0.
    """
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)

    peaks = read_peaks(peaks_path)
    background_peaks = creat_background(peaks, length, counter)
    write_fasta(background_peaks, tmp_dir, "shuffled")
    write_fasta(peaks, tmp_dir, "train")

    # Positives are scored before negatives, as two plain lists.
    true_scores = list(
        true_scores_inmode(inmode_model, path_to_inmode, path_to_java,
                           tmp_dir, "train", 'current'))
    false_scores = list(
        false_scores_inmode(inmode_model, path_to_inmode, path_to_java,
                            tmp_dir, "shuffled", 'current'))

    roc = calculate_short_roc(calculate_fprs(true_scores, false_scores),
                              step=1)
    auc = calculate_particial_auc(roc['TPR'], roc['FPR'], pfpr)
    write_auc(output_dir + '/inmode_auc.txt', auc)
    write_roc(output_dir + "/inmode_cv.txt", roc)

    shutil.rmtree(tmp_dir)
    return 0
def cross_validation_pwm(pwm, length, peaks_path, counter, output_dir, pfpr):
    """Evaluate a PWM on peaks versus a generated background.

    Peaks are loaded with ``read_peaks`` and a background set is derived from
    them with ``creat_background``. Both sets are scored with ``pwm``. The
    resulting ROC and its partial AUC at ``pfpr`` are written to
    ``output_dir`` as ``pwm_cv.txt`` and ``pwm_auc.txt``.

    Always returns 0.
    """
    peaks = read_peaks(peaks_path)
    background_peaks = creat_background(peaks, length, counter)

    # Positives are scored before negatives, as two plain lists.
    true_scores = list(true_scores_pwm(peaks, pwm, length))
    false_scores = list(false_scores_pwm(background_peaks, pwm, length))

    roc = calculate_short_roc(calculate_fprs(true_scores, false_scores),
                              step=1)
    auc = calculate_particial_auc(roc['TPR'], roc['FPR'], pfpr)
    write_auc(output_dir + '/pwm_auc.txt', auc)
    write_roc(output_dir + "/pwm_cv.txt", roc)
    return 0
Example #3
0
def learn_optimized_bamm_support(peaks_path, backgroud_path, counter, order, length, pwm_auc_dir, tmp_dir, output_dir, pfpr):
    """Run two-fold (odd/even) cross-validation of a BaMM model for one length.

    For each fold the peaks are split by 1-based position parity, a BaMM
    model is built from the training half (seeded with the fold's
    ``pwm_model_<step>_<length>.meme`` file from ``pwm_auc_dir``), and the
    test half plus a background set are scored. The background is read from
    ``backgroud_path`` when that is an existing file; otherwise it is
    generated from the test half and written to ``tmp_dir``.

    Scores from both folds are pooled into one ROC. The short ROC, the merged
    ROC and the partial AUC at ``pfpr`` (computed from the merged ROC) are
    written to ``output_dir``, along with copies of the per-fold model files.

    Always returns 0.
    """
    peaks = read_peaks(peaks_path)
    train_fasta = tmp_dir + '/train.fasta'
    true_scores = []
    false_scores = []

    for step in ('odd', 'even'):
        meme = pwm_auc_dir + '/pwm_model_{0}_{1}.meme'.format(step, length)

        # 'odd' trains on peaks 1, 3, 5, ...; 'even' trains on 2, 4, 6, ...
        train_parity = 1 if step == 'odd' else 0
        train_peaks = [
            peak for number, peak in enumerate(peaks, 1)
            if number % 2 == train_parity
        ]
        test_peaks = [
            peak for number, peak in enumerate(peaks, 1)
            if number % 2 != train_parity
        ]
        write_fasta(train_peaks, train_fasta)

        if os.path.isfile(backgroud_path):
            shuffled_peaks = read_peaks(backgroud_path)
            background_fasta = backgroud_path
        else:
            shuffled_peaks = creat_background(test_peaks, length, counter)
            background_fasta = tmp_dir + '/background.fasta'
            write_fasta(shuffled_peaks, background_fasta)

        # `order` is rebound to whatever create_bamm_model returns, so the
        # second fold starts from the first fold's returned order.
        bamm, order = create_bamm_model(train_fasta, background_fasta, tmp_dir, order, meme, 0, length)

        true_scores.extend(true_scores_bamm(test_peaks, bamm, order, length))
        false_scores.extend(false_scores_bamm(shuffled_peaks, bamm, order, length))

        shutil.copy(tmp_dir + '/{}_motif_1.ihbcp'.format(length),
                    output_dir + '/bamm_model_{0}_{1}_{2}.ihbcp'.format(step, order, length))
        shutil.copy(tmp_dir + '/{}.hbcp'.format(length),
                    output_dir + '/bamm_{0}_{1}_{2}.hbcp'.format(step, order, length))

    fprs = calculate_fprs(true_scores, false_scores)
    roc = calculate_short_roc(fprs, step=1)
    merged_roc = calculate_merged_roc(fprs)
    auc = calculate_particial_auc(merged_roc['TPR'], merged_roc['FPR'], pfpr)

    print("Length {0}; Order {1}".format(length, order), "pAUC at {0} = {1};".format(pfpr, auc))
    write_auc_with_order(output_dir + '/auc.txt', auc, length, order)
    write_roc(output_dir + "/training_bootstrap_{0}_{1}.txt".format(length, order), roc)
    write_roc(output_dir + "/training_bootstrap_merged_{0}_{1}.txt".format(length, order), merged_roc)
    return 0
Example #4
0
def learn_optimized_pwm(peaks_path, backgroud_path, counter, tmp_dir,
                        output_auc, pfpr):
    """Train STREME-based PWMs over several lengths with odd/even CV.

    For every requested length (8, 12, ..., 28) the peaks are split by
    1-based position parity into two folds. In each fold STREME is run on the
    training half, the result is parsed from ``tmp_dir + '/streme.txt'`` and
    turned into a PWM, and the test half plus a background set are scored.
    The background is read from ``backgroud_path`` when that is an existing
    file; otherwise it is generated from the test half.

    Per-fold models are written to ``output_auc`` (MEME, PWM and PFM). Per
    length, the pooled scores yield a short ROC, a merged ROC and a partial
    AUC at ``pfpr``; the AUC goes to ``output_auc + '/auc.txt'``, which is
    deleted at the start if it already exists.

    ``tmp_dir`` is created when missing and removed at the end of a
    successful run. Always returns 0.
    """
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)
    if not os.path.isdir(output_auc):
        os.mkdir(output_auc)
    auc_path = output_auc + '/auc.txt'
    if os.path.exists(auc_path):
        os.remove(auc_path)

    train_fasta = tmp_dir + '/train.fasta'

    # Requested motif lengths: 8, 12, 16, 20, 24, 28.
    for length in range(8, 31, 4):
        true_scores = []
        false_scores = []
        peaks = read_peaks(peaks_path)

        for step in ('odd', 'even'):
            # 'odd' trains on peaks 1, 3, 5, ...; 'even' trains on 2, 4, ...
            train_parity = 1 if step == 'odd' else 0
            train_peaks = [
                peak for number, peak in enumerate(peaks, 1)
                if number % 2 == train_parity
            ]
            test_peaks = [
                peak for number, peak in enumerate(peaks, 1)
                if number % 2 != train_parity
            ]
            write_fasta(train_peaks, train_fasta)

            if os.path.isfile(backgroud_path):
                shuffled_peaks = read_peaks(backgroud_path)
                run_streme(train_fasta, backgroud_path, tmp_dir, length)
            else:
                shuffled_peaks = creat_background(test_peaks, length, counter)
                run_streme_hmm_background(train_fasta, tmp_dir, length)

            # NOTE(review): `length` is rebound here to the value reported by
            # parse_streme, so the rest of this fold, the next fold, and the
            # per-length outputs below all use the parsed value -- confirm
            # this is intended.
            pfm, background, length, nsites = parse_streme(
                tmp_dir + '/streme.txt')
            pwm = make_pwm(pfm)

            true_scores.extend(true_scores_pwm(test_peaks, pwm, length))
            false_scores.extend(false_scores_pwm(shuffled_peaks, pwm, length))

            tag = 'pwm_model_{0}_{1}'.format(step, length)
            write_meme(output_auc, tag, pfm, background, nsites)
            write_pwm(output_auc, tag, pwm)
            write_pfm(output_auc, tag, pfm)

        fprs = calculate_fprs(true_scores, false_scores)
        roc = calculate_short_roc(fprs, step=1)
        merged_roc = calculate_merged_roc(fprs)
        auc = calculate_particial_auc(merged_roc['TPR'], merged_roc['FPR'],
                                      pfpr)

        print("Length {};".format(length),
              "pAUC at {0} = {1};".format(pfpr, auc))
        write_auc(auc_path, auc, length)
        write_roc(
            output_auc + "/training_bootstrap_merged_{0}.txt".format(length),
            merged_roc)
        write_roc(output_auc + "/training_bootstrap_{0}.txt".format(length),
                  roc)

    shutil.rmtree(tmp_dir)
    return 0
def learn_optimized_inmode(peaks_path, backgroud_path, counter, path_to_inmode,
                           path_to_java, tmp_dir, output_auc, pfpr):
    """Train InMoDe models over orders and lengths with odd/even CV.

    For every order in 1..3 and every length in 8, 12, ..., 28 the peaks are
    split by 1-based position parity into two folds. In each fold the
    background, training and test sets are passed to ``write_fasta`` under
    the tags "shuffled", "train" and "test", a model is built with
    ``make_inmode`` from ``<tmp_dir>/train.fa``, and the test and background
    sets are scored. The background is read from ``backgroud_path`` when that
    is an existing file; otherwise it is generated from the test half.

    Each fold's ``inmode_model_<order>_<length>.xml`` is copied from
    ``tmp_dir`` into ``output_auc``. Per (order, length), the pooled scores
    yield a short ROC, a merged ROC and a partial AUC at ``pfpr``; the AUC
    goes to ``output_auc + '/auc.txt'``, which is truncated at the start.

    ``tmp_dir`` is created when missing and removed at the end of a
    successful run. Always returns 0.
    """
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)
    if not os.path.isdir(output_auc):
        os.mkdir(output_auc)
    auc_path = output_auc + '/auc.txt'
    # Create the AUC table, or empty it if it is left over from an earlier run.
    open(auc_path, 'w').close()

    for order in range(1, 4):
        # Motif lengths: 8, 12, 16, 20, 24, 28.
        for length in range(8, 31, 4):
            true_scores = []
            false_scores = []
            peaks = read_peaks(peaks_path)

            for step in ('odd', 'even'):
                # 'odd' trains on peaks 1, 3, 5, ...; 'even' on 2, 4, 6, ...
                train_parity = 1 if step == 'odd' else 0
                train_peaks = [
                    peak for number, peak in enumerate(peaks, 1)
                    if number % 2 == train_parity
                ]
                test_peaks = [
                    peak for number, peak in enumerate(peaks, 1)
                    if number % 2 != train_parity
                ]

                if os.path.isfile(backgroud_path):
                    shuffled_peaks = read_peaks(backgroud_path)
                else:
                    shuffled_peaks = creat_background(test_peaks, length,
                                                      counter)

                write_fasta(shuffled_peaks, tmp_dir, "shuffled")
                write_fasta(train_peaks, tmp_dir, "train")
                write_fasta(test_peaks, tmp_dir, "test")

                train_fasta = '{0}/{1}.fa'.format(tmp_dir, 'train')
                make_inmode(train_fasta, path_to_inmode, path_to_java, length,
                            order, tmp_dir, str(length))

                true_scores.extend(
                    true_scores_inmode(path_to_inmode, path_to_java, length,
                                       tmp_dir, "test", str(length),
                                       str(order)))
                false_scores.extend(
                    false_scores_inmode(path_to_inmode, path_to_java, length,
                                        tmp_dir, "shuffled", str(length),
                                        str(order)))

                model_source = (
                    tmp_dir +
                    '/inmode_model_{0}_{1}.xml'.format(order, length))
                model_target = (
                    output_auc +
                    '/inmode_model_{0}_{1}_{2}.xml'.format(step, order,
                                                           length))
                shutil.copy(model_source, model_target)

            fprs = calculate_fprs(true_scores, false_scores)
            roc = calculate_short_roc(fprs, step=1)
            merged_roc = calculate_merged_roc(fprs)
            auc = calculate_particial_auc(merged_roc['TPR'], merged_roc['FPR'],
                                          pfpr)

            print("Length {0}; Order {1}".format(length, order),
                  "pAUC at {0} = {1};".format(pfpr, auc))
            write_auc_with_order(auc_path, auc, length, order)
            write_roc(
                output_auc +
                "/training_bootstrap_merged_{0}_{1}.txt".format(length, order),
                merged_roc)
            write_roc(
                output_auc +
                "/training_bootstrap_{0}_{1}.txt".format(length, order), roc)

    shutil.rmtree(tmp_dir)
    return 0
def copy_results_of_cv(outdir, models_dir, model, pfpr):
    """Re-export a model's stored bootstrap ROC together with its partial AUC.

    Reads ``<models_dir>/<model>_model/bootstrap.txt`` with ``read_roc``,
    computes the partial AUC at ``pfpr`` from its 'TPR' and 'FPR' entries,
    and writes ``<model>_auc.txt`` and ``<model>_cv.txt`` into ``outdir``.

    Always returns 0.
    """
    bootstrap_path = models_dir + '/{}_model/bootstrap.txt'.format(model)
    auc_path = outdir + '/{}_auc.txt'.format(model)
    roc_path = outdir + '/{}_cv.txt'.format(model)

    roc = read_roc(bootstrap_path)
    auc = calculate_particial_auc(roc['TPR'], roc['FPR'], pfpr)
    write_auc(auc_path, auc)
    write_roc(roc_path, roc)
    return 0
Example #7
0
def learn_optimized_strum(peaks_path, backgroud_path, counter, tmp_dir,
                          output_auc, cpu_count, pfpr):
    """Train StruM models over several lengths with odd/even cross-validation.

    For every length in 8, 12, ..., 28 the peaks are split by 1-based
    position parity into two folds. In each fold a model is learned de novo
    from the training half, and the test half plus a background set are
    scored. The background is read from ``backgroud_path`` when that is an
    existing file; otherwise it is generated from the test half.

    Per length, the pooled scores yield a short ROC, a merged ROC and a
    partial AUC at ``pfpr`` (computed from the merged ROC), all written to
    ``output_auc``. The best sites collected from both folds (those without
    'N') are turned into a PFM and written as a MEME file with a uniform
    background. The pickled model is the one from the last ('even') fold.

    Args:
        peaks_path: path passed to ``read_peaks`` for the foreground peaks.
        backgroud_path: optional background file; used only if it exists.
        counter: forwarded to ``creat_background``.
        tmp_dir: scratch directory; created when missing, removed at the end
            of a successful run.
        output_auc: output directory; created when missing. An existing
            ``auc.txt`` inside it is deleted first.
        cpu_count: forwarded to ``strum_de_novo``.
        pfpr: false-positive-rate threshold for the partial AUC.

    Returns:
        Always 0.
    """
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)
    if not os.path.isdir(output_auc):
        os.mkdir(output_auc)
    if os.path.exists(output_auc + '/auc.txt'):
        os.remove(output_auc + '/auc.txt')
    peaks = read_peaks(peaks_path)
    # Motif lengths: 8, 12, 16, 20, 24, 28.
    for length in range(8, 31, 4):
        true_scores = []
        false_scores = []
        sites = []
        for step in ['odd', 'even']:
            # The folds must be complementary: 'odd' trains on peaks
            # 1, 3, 5, ... and tests on 2, 4, 6, ...; 'even' swaps the halves.
            # The split is deliberately not reassigned after this branch, so
            # each fold is evaluated on data its model was not trained on.
            if step == 'odd':
                train_peaks = [
                    p for index, p in enumerate(peaks, 1) if index % 2 != 0
                ]
                test_peaks = [
                    p for index, p in enumerate(peaks, 1) if index % 2 == 0
                ]
            else:
                train_peaks = [
                    p for index, p in enumerate(peaks, 1) if index % 2 == 0
                ]
                test_peaks = [
                    p for index, p in enumerate(peaks, 1) if index % 2 != 0
                ]
            write_fasta(train_peaks, tmp_dir + '/train.fasta')
            if os.path.isfile(backgroud_path):
                shuffled_peaks = read_peaks(backgroud_path)
            else:
                shuffled_peaks = creat_background(test_peaks, length, counter)
            strum_model = strum_de_novo(tmp_dir + '/train.fasta', length,
                                        cpu_count)
            # true_scores_strum is unpacked into two parallel sequences
            # (scores, sites), which are consumed pairwise.
            for true_score, site in zip(
                    *true_scores_strum(test_peaks, strum_model, length)):
                true_scores.append(true_score)
                sites.append(site)
            false_scores.extend(false_scores_strum(shuffled_peaks, strum_model))
        fprs = calculate_fprs(true_scores, false_scores)
        roc = calculate_short_roc(fprs, step=1)
        merged_roc = calculate_merged_roc(fprs)
        auc = calculate_particial_auc(merged_roc['TPR'], merged_roc['FPR'],
                                      pfpr)
        print("Length {};".format(length),
              "pAUC at {0} = {1};".format(pfpr, auc))
        write_auc(output_auc + '/auc.txt', auc, length)
        write_roc(
            output_auc + "/training_bootstrap_merged_{0}.txt".format(length),
            merged_roc)
        write_roc(output_auc + "/training_bootstrap_{0}.txt".format(length),
                  roc)
        tag = 'strum_model_{0}'.format(length)
        # Sites containing 'N' are excluded before counting.
        sites = [i for i in sites if 'N' not in i]
        pcm = make_pcm(sites)
        pfm = make_pfm(pcm)
        nsites = len(sites)
        background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
        # strum_model here is the model from the last fold of the loop above.
        write_strum(strum_model, output_auc + '/{}.pickle'.format(tag))
        write_meme(output_auc, tag, pfm, background, nsites)
    shutil.rmtree(tmp_dir)
    return 0
def learn_optimized_pwm(peaks_path, backgroud_path, counter, path_to_java,
                        path_to_chipmunk, tmp_r, output_auc, cpu_count, pfpr):
    """Train ChIPMunk-based PWMs over several lengths with odd/even CV.

    For every length in 8, 12, ..., 28 the peaks are split by 1-based
    position parity into two folds. In each fold ChIPMunk is run on the
    training half, its sites are parsed and de-duplicated, and a PWM built
    from them scores the test half plus a background set. The background is
    read from ``backgroud_path`` when that is an existing file; otherwise it
    is generated from the test half.

    Per-fold models are written to ``output_auc`` (MEME, PWM, PFM and the
    site list) with a uniform background. Per length, the pooled scores yield
    a short ROC, a merged ROC and a partial AUC at ``pfpr``; the AUC goes to
    ``output_auc + '/auc.txt'``, which is deleted at the start if it exists.

    ``tmp_r`` is created when missing and removed at the end of a successful
    run. Always returns 0.
    """
    if not os.path.exists(tmp_r):
        os.mkdir(tmp_r)
    if not os.path.isdir(output_auc):
        os.mkdir(output_auc)
    auc_path = output_auc + '/auc.txt'
    if os.path.exists(auc_path):
        os.remove(auc_path)

    train_fasta = tmp_r + '/train.fasta'
    chipmunk_output = tmp_r + '/chipmunk_results.txt'

    # Motif lengths: 8, 12, 16, 20, 24, 28.
    for length in range(8, 31, 4):
        true_scores = []
        false_scores = []
        peaks = read_peaks(peaks_path)

        for step in ('odd', 'even'):
            # 'odd' trains on peaks 1, 3, 5, ...; 'even' trains on 2, 4, ...
            train_parity = 1 if step == 'odd' else 0
            train_peaks = [
                peak for number, peak in enumerate(peaks, 1)
                if number % 2 == train_parity
            ]
            test_peaks = [
                peak for number, peak in enumerate(peaks, 1)
                if number % 2 != train_parity
            ]
            write_fasta(train_peaks, train_fasta)

            if os.path.isfile(backgroud_path):
                shuffled_peaks = read_peaks(backgroud_path)
            else:
                shuffled_peaks = creat_background(test_peaks, length, counter)

            # The same length is passed twice to run_chipmunk.
            run_chipmunk(path_to_java, path_to_chipmunk, train_fasta,
                         chipmunk_output, length, length, cpu_count)
            sites = list(set(parse_chipmunk(chipmunk_output)))

            # Scoring uses the matrix from sites_to_pwm; the matrix written
            # to disk below is rebuilt through make_pcm/make_pfm/make_pwm.
            scoring_pwm = sites_to_pwm(sites)
            true_scores.extend(
                true_scores_pwm(test_peaks, scoring_pwm, length))
            false_scores.extend(
                false_scores_pwm(shuffled_peaks, scoring_pwm, length))

            pfm = make_pfm(make_pcm(sites))
            pwm = make_pwm(pfm)
            nsites = len(sites)
            background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
            tag = 'pwm_model_{0}_{1}'.format(step, length)
            write_meme(output_auc, tag, pfm, background, nsites)
            write_pwm(output_auc, tag, pwm)
            write_pfm(output_auc, tag, pfm)
            write_sites(output=output_auc, tag=tag, sites=sites)

        fprs = calculate_fprs(true_scores, false_scores)
        roc = calculate_short_roc(fprs, step=1)
        merged_roc = calculate_merged_roc(fprs)
        auc = calculate_particial_auc(merged_roc['TPR'], merged_roc['FPR'],
                                      pfpr)

        print("Length {};".format(length),
              "pAUC at {0} = {1};".format(pfpr, auc))
        write_auc(auc_path, auc, length)
        write_roc(
            output_auc + "/training_bootstrap_merged_{0}.txt".format(length),
            merged_roc)
        write_roc(output_auc + "/training_bootstrap_{0}.txt".format(length),
                  roc)

    shutil.rmtree(tmp_r)
    return 0