def cross_validation_inmode(inmode_model, peaks_path, counter, length,
                            path_to_inmode, path_to_java, tmp_dir,
                            output_dir, pfpr):
    """Score peaks and a background set with an existing InMoDe model.

    Peaks are read from ``peaks_path`` and a background set is obtained from
    ``creat_background(peaks, length, counter)``.  Both sets are written into
    ``tmp_dir`` (tags ``"train"`` and ``"shuffled"``) and scored through
    ``true_scores_inmode`` / ``false_scores_inmode``.  The ROC table and the
    value returned by ``calculate_particial_auc`` for ``pfpr`` are written to
    ``output_dir`` as ``inmode_cv.txt`` and ``inmode_auc.txt``.

    ``tmp_dir`` is created when missing and removed before returning.

    Returns:
        Always ``0``.
    """
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)

    peaks = read_peaks(peaks_path)
    shuffled_peaks = creat_background(peaks, length, counter)
    write_fasta(shuffled_peaks, tmp_dir, "shuffled")
    write_fasta(peaks, tmp_dir, "train")

    true_scores = list(true_scores_inmode(
        inmode_model, path_to_inmode, path_to_java, tmp_dir, "train", 'current'))
    false_scores = list(false_scores_inmode(
        inmode_model, path_to_inmode, path_to_java, tmp_dir, "shuffled", 'current'))

    fprs = calculate_fprs(true_scores, false_scores)
    roc = calculate_short_roc(fprs, step=1)
    auc = calculate_particial_auc(roc['TPR'], roc['FPR'], pfpr)

    write_auc(output_dir + '/inmode_auc.txt', auc)
    write_roc(output_dir + "/inmode_cv.txt", roc)

    shutil.rmtree(tmp_dir)
    return 0
def cross_validation_pwm(pwm, length, peaks_path, counter, output_dir, pfpr):
    """Score peaks and a background set with an existing PWM.

    Peaks are read from ``peaks_path``; the background comes from
    ``creat_background(peaks, length, counter)``.  Scores produced by
    ``true_scores_pwm`` and ``false_scores_pwm`` are turned into a ROC table,
    and that table plus the value returned by ``calculate_particial_auc`` for
    ``pfpr`` are written to ``output_dir`` as ``pwm_cv.txt`` and
    ``pwm_auc.txt``.

    Returns:
        Always ``0``.
    """
    peaks = read_peaks(peaks_path)
    shuffled_peaks = creat_background(peaks, length, counter)

    true_scores = list(true_scores_pwm(peaks, pwm, length))
    false_scores = list(false_scores_pwm(shuffled_peaks, pwm, length))

    fprs = calculate_fprs(true_scores, false_scores)
    roc = calculate_short_roc(fprs, step=1)
    auc = calculate_particial_auc(roc['TPR'], roc['FPR'], pfpr)

    write_auc(output_dir + '/pwm_auc.txt', auc)
    write_roc(output_dir + "/pwm_cv.txt", roc)
    return 0
def learn_optimized_bamm_support(peaks_path, backgroud_path, counter, order,
                                 length, pwm_auc_dir, tmp_dir, output_dir,
                                 pfpr):
    """Train and evaluate BaMM models on an odd/even split of the peaks.

    For each of the two folds (``'odd'``, ``'even'``) the peaks with matching
    1-based index parity are used for training and the remaining peaks for
    testing.  The model is built by ``create_bamm_model`` from the training
    FASTA, a background FASTA and the MEME file
    ``pwm_auc_dir/pwm_model_<step>_<length>.meme``.  If ``backgroud_path`` is
    an existing file it is used as background; otherwise a background is
    obtained from ``creat_background`` and written to
    ``tmp_dir/background.fasta``.

    Scores from both folds are pooled; ROC tables and the value returned by
    ``calculate_particial_auc`` for ``pfpr`` are written to ``output_dir``,
    together with copies of the model files of every fold.

    Returns:
        Always ``0``.
    """
    true_scores = []
    false_scores = []
    peaks = read_peaks(peaks_path)
    train_fasta = tmp_dir + '/train.fasta'

    # The second tuple item is the remainder (index % 2) of training peaks.
    for step, train_rem in (('odd', 1), ('even', 0)):
        meme = pwm_auc_dir + '/pwm_model_{0}_{1}.meme'.format(step, length)
        train_peaks = [p for idx, p in enumerate(peaks, 1) if idx % 2 == train_rem]
        test_peaks = [p for idx, p in enumerate(peaks, 1) if idx % 2 != train_rem]
        write_fasta(train_peaks, train_fasta)

        if os.path.isfile(backgroud_path):
            shuffled_peaks = read_peaks(backgroud_path)
            background_fasta = backgroud_path
        else:
            shuffled_peaks = creat_background(test_peaks, length, counter)
            background_fasta = tmp_dir + '/background.fasta'
            write_fasta(shuffled_peaks, background_fasta)

        # NOTE: create_bamm_model returns an order that replaces the argument,
        # so the second fold and all output names use the returned value.
        bamm, order = create_bamm_model(
            train_fasta, background_fasta, tmp_dir, order, meme, 0, length)

        true_scores.extend(true_scores_bamm(test_peaks, bamm, order, length))
        false_scores.extend(false_scores_bamm(shuffled_peaks, bamm, order, length))

        shutil.copy(
            tmp_dir + '/{}_motif_1.ihbcp'.format(length),
            output_dir + '/bamm_model_{0}_{1}_{2}.ihbcp'.format(step, order, length))
        shutil.copy(
            tmp_dir + '/{}.hbcp'.format(length),
            output_dir + '/bamm_{0}_{1}_{2}.hbcp'.format(step, order, length))

    fprs = calculate_fprs(true_scores, false_scores)
    roc = calculate_short_roc(fprs, step=1)
    merged_roc = calculate_merged_roc(fprs)
    auc = calculate_particial_auc(merged_roc['TPR'], merged_roc['FPR'], pfpr)
    print("Length {0}; Order {1}".format(length, order),
          "pAUC at {0} = {1};".format(pfpr, auc))

    write_auc_with_order(output_dir + '/auc.txt', auc, length, order)
    write_roc(output_dir + "/training_bootstrap_{0}_{1}.txt".format(length, order), roc)
    write_roc(output_dir + "/training_bootstrap_merged_{0}_{1}.txt".format(length, order),
              merged_roc)
    return 0
def learn_optimized_pwm(peaks_path, backgroud_path, counter, tmp_dir,
                        output_auc, pfpr):
    """Scan motif lengths with STREME and evaluate each on an odd/even split.

    For every length in ``range(8, 31, 4)`` and for both folds (``'odd'``,
    ``'even'``) the training half of the peaks is written to
    ``tmp_dir/train.fasta`` and a motif is learned with ``run_streme`` (when
    ``backgroud_path`` is an existing file) or ``run_streme_hmm_background``
    (otherwise, with the scoring background taken from ``creat_background``).
    The motif parsed from ``tmp_dir/streme.txt`` is converted to a PWM, used
    to score the test half and the background, and saved under the tag
    ``pwm_model_<step>_<length>``.  Per length, pooled scores give the ROC
    tables and the value appended to ``output_auc/auc.txt``.

    ``tmp_dir`` and ``output_auc`` are created when missing; an existing
    ``auc.txt`` is deleted first; ``tmp_dir`` is removed at the end.

    NOTE(review): another function named ``learn_optimized_pwm`` appears in
    this source; if both live in one module the later definition wins.

    Returns:
        Always ``0``.
    """
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)
    if not os.path.isdir(output_auc):
        os.mkdir(output_auc)
    auc_path = output_auc + '/auc.txt'
    if os.path.exists(auc_path):
        os.remove(auc_path)

    train_fasta = tmp_dir + '/train.fasta'

    # Candidate lengths 8, 12, ..., 28 (a former 12..40 range is disabled).
    for length in range(8, 31, 4):
        true_scores = []
        false_scores = []
        peaks = read_peaks(peaks_path)

        # The second tuple item is the remainder (index % 2) of training peaks.
        for step, train_rem in (('odd', 1), ('even', 0)):
            train_peaks = [p for idx, p in enumerate(peaks, 1) if idx % 2 == train_rem]
            test_peaks = [p for idx, p in enumerate(peaks, 1) if idx % 2 != train_rem]
            write_fasta(train_peaks, train_fasta)

            if os.path.isfile(backgroud_path):
                shuffled_peaks = read_peaks(backgroud_path)
                run_streme(train_fasta, backgroud_path, tmp_dir, length)
            else:
                shuffled_peaks = creat_background(test_peaks, length, counter)
                run_streme_hmm_background(train_fasta, tmp_dir, length)

            # NOTE: ``length`` is rebound to the value reported by
            # parse_streme; later uses in this iteration see that value.
            pfm, background, length, nsites = parse_streme(tmp_dir + '/streme.txt')
            pwm = make_pwm(pfm)

            true_scores.extend(true_scores_pwm(test_peaks, pwm, length))
            false_scores.extend(false_scores_pwm(shuffled_peaks, pwm, length))

            tag = 'pwm_model_{0}_{1}'.format(step, length)
            write_meme(output_auc, tag, pfm, background, nsites)
            write_pwm(output_auc, tag, pwm)
            write_pfm(output_auc, tag, pfm)

        fprs = calculate_fprs(true_scores, false_scores)
        roc = calculate_short_roc(fprs, step=1)
        merged_roc = calculate_merged_roc(fprs)
        auc = calculate_particial_auc(merged_roc['TPR'], merged_roc['FPR'], pfpr)
        print("Length {};".format(length), "pAUC at {0} = {1};".format(pfpr, auc))

        write_auc(output_auc + '/auc.txt', auc, length)
        write_roc(output_auc + "/training_bootstrap_merged_{0}.txt".format(length),
                  merged_roc)
        write_roc(output_auc + "/training_bootstrap_{0}.txt".format(length), roc)

    shutil.rmtree(tmp_dir)
    return 0
def learn_optimized_inmode(peaks_path, backgroud_path, counter, path_to_inmode,
                           path_to_java, tmp_dir, output_auc, pfpr):
    """Scan InMoDe orders and lengths, evaluating each on an odd/even split.

    For every order in ``range(1, 4)`` and length in ``range(8, 31, 4)`` the
    peaks are split by 1-based index parity into two folds.  In each fold the
    background (read from ``backgroud_path`` when it is an existing file,
    otherwise obtained from ``creat_background``), the training peaks and the
    test peaks are written into ``tmp_dir``; ``make_inmode`` is run on
    ``tmp_dir/train.fa``; test and background sets are scored; and
    ``tmp_dir/inmode_model_<order>_<length>.xml`` is copied to ``output_auc``
    with the fold name added.  Pooled scores of both folds give the ROC
    tables and the value appended to ``output_auc/auc.txt``.

    ``tmp_dir`` and ``output_auc`` are created when missing; ``auc.txt`` is
    truncated at the start; ``tmp_dir`` is removed at the end.

    Returns:
        Always ``0``.
    """
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)
    if not os.path.isdir(output_auc):
        os.mkdir(output_auc)
    # Create or truncate the results file.
    with open(output_auc + '/auc.txt', 'w'):
        pass

    for order in range(1, 4):
        # Candidate lengths 8, 12, ..., 28 (a former 12..40 range is disabled).
        for length in range(8, 31, 4):
            true_scores = []
            false_scores = []
            peaks = read_peaks(peaks_path)

            # The second tuple item is the remainder (index % 2) of training peaks.
            for step, train_rem in (('odd', 1), ('even', 0)):
                train_peaks = [p for idx, p in enumerate(peaks, 1) if idx % 2 == train_rem]
                test_peaks = [p for idx, p in enumerate(peaks, 1) if idx % 2 != train_rem]

                if os.path.isfile(backgroud_path):
                    shuffled_peaks = read_peaks(backgroud_path)
                else:
                    shuffled_peaks = creat_background(test_peaks, length, counter)

                write_fasta(shuffled_peaks, tmp_dir, "shuffled")
                write_fasta(train_peaks, tmp_dir, "train")
                write_fasta(test_peaks, tmp_dir, "test")

                train_fasta = '{0}/{1}.fa'.format(tmp_dir, 'train')
                make_inmode(train_fasta, path_to_inmode, path_to_java,
                            length, order, tmp_dir, str(length))

                true_scores.extend(true_scores_inmode(
                    path_to_inmode, path_to_java, length, tmp_dir,
                    "test", str(length), str(order)))
                false_scores.extend(false_scores_inmode(
                    path_to_inmode, path_to_java, length, tmp_dir,
                    "shuffled", str(length), str(order)))

                model_src = tmp_dir + '/inmode_model_{0}_{1}.xml'.format(order, length)
                model_dst = output_auc + '/inmode_model_{0}_{1}_{2}.xml'.format(
                    step, order, length)
                shutil.copy(model_src, model_dst)

            fprs = calculate_fprs(true_scores, false_scores)
            roc = calculate_short_roc(fprs, step=1)
            merged_roc = calculate_merged_roc(fprs)
            auc = calculate_particial_auc(merged_roc['TPR'], merged_roc['FPR'], pfpr)
            print("Length {0}; Order {1}".format(length, order),
                  "pAUC at {0} = {1};".format(pfpr, auc))

            write_auc_with_order(output_auc + '/auc.txt', auc, length, order)
            write_roc(
                output_auc + "/training_bootstrap_merged_{0}_{1}.txt".format(length, order),
                merged_roc)
            write_roc(
                output_auc + "/training_bootstrap_{0}_{1}.txt".format(length, order),
                roc)

    shutil.rmtree(tmp_dir)
    return 0
def copy_results_of_cv(outdir, models_dir, model, pfpr):
    """Re-export a stored ROC table for ``model`` together with its AUC value.

    The ROC table is read from ``models_dir/<model>_model/bootstrap.txt``.
    The value returned by ``calculate_particial_auc`` for ``pfpr`` is written
    to ``outdir/<model>_auc.txt`` and the table itself to
    ``outdir/<model>_cv.txt``.

    Returns:
        Always ``0``.
    """
    source_path = models_dir + '/{}_model/bootstrap.txt'.format(model)
    roc = read_roc(source_path)
    auc = calculate_particial_auc(roc['TPR'], roc['FPR'], pfpr)

    auc_path = outdir + '/{}_auc.txt'.format(model)
    roc_path = outdir + '/{}_cv.txt'.format(model)
    write_auc(auc_path, auc)
    write_roc(roc_path, roc)
    return 0
def learn_optimized_strum(peaks_path, backgroud_path, counter, tmp_dir,
                          output_auc, cpu_count, pfpr):
    """Scan StruM motif lengths and evaluate each on an odd/even split.

    For every length in ``range(8, 31, 4)`` the peaks are split by 1-based
    index parity into two folds.  In the ``'odd'`` fold the odd-indexed peaks
    train the model and the even-indexed peaks test it; in the ``'even'``
    fold the roles are swapped.  Each fold writes its training peaks to
    ``tmp_dir/train.fasta``, learns a model with ``strum_de_novo``, and scores
    the test peaks and the background (read from ``backgroud_path`` when it
    is an existing file, otherwise obtained from ``creat_background``).

    Per length, pooled scores of both folds give the ROC tables and the
    value appended to ``output_auc/auc.txt``.  The sites returned by
    ``true_scores_strum`` that contain no ``'N'`` are turned into a PFM and
    written as a MEME file; the model of the last fold is written to
    ``output_auc/strum_model_<length>.pickle``.

    ``tmp_dir`` and ``output_auc`` are created when missing; an existing
    ``auc.txt`` is deleted first; ``tmp_dir`` is removed at the end.

    Returns:
        Always ``0``.
    """
    if not os.path.exists(tmp_dir):
        os.mkdir(tmp_dir)
    if not os.path.isdir(output_auc):
        os.mkdir(output_auc)
    if os.path.exists(output_auc + '/auc.txt'):
        os.remove(output_auc + '/auc.txt')

    peaks = read_peaks(peaks_path)

    # Candidate lengths 8, 12, ..., 28 (a former 12..40 range is disabled).
    for length in range(8, 31, 4):
        true_scores = []
        false_scores = []
        sites = []

        for step in ['odd', 'even']:
            # Each fold must use its own split.  An unconditional
            # re-assignment that used to follow this branch forced the
            # odd-train/even-test split in both folds, so the second fold
            # merely repeated the first; it has been removed.
            if step == 'odd':
                train_peaks = [p for index, p in enumerate(peaks, 1) if index % 2 != 0]
                test_peaks = [p for index, p in enumerate(peaks, 1) if index % 2 == 0]
            else:
                train_peaks = [p for index, p in enumerate(peaks, 1) if index % 2 == 0]
                test_peaks = [p for index, p in enumerate(peaks, 1) if index % 2 != 0]

            write_fasta(train_peaks, tmp_dir + '/train.fasta')
            if os.path.isfile(backgroud_path):
                shuffled_peaks = read_peaks(backgroud_path)
            else:
                shuffled_peaks = creat_background(test_peaks, length, counter)

            strum_model = strum_de_novo(tmp_dir + '/train.fasta', length, cpu_count)

            # true_scores_strum yields two parallel sequences: scores, sites.
            for true_score, site in zip(
                    *true_scores_strum(test_peaks, strum_model, length)):
                true_scores.append(true_score)
                sites.append(site)
            false_scores.extend(false_scores_strum(shuffled_peaks, strum_model))

        fprs = calculate_fprs(true_scores, false_scores)
        roc = calculate_short_roc(fprs, step=1)
        merged_roc = calculate_merged_roc(fprs)
        auc = calculate_particial_auc(merged_roc['TPR'], merged_roc['FPR'], pfpr)
        print("Length {};".format(length), "pAUC at {0} = {1};".format(pfpr, auc))

        write_auc(output_auc + '/auc.txt', auc, length)
        write_roc(output_auc + "/training_bootstrap_merged_{0}.txt".format(length),
                  merged_roc)
        write_roc(output_auc + "/training_bootstrap_{0}.txt".format(length), roc)

        tag = 'strum_model_{0}'.format(length)
        # Sites containing 'N' are excluded before the count matrix is built.
        sites = [site for site in sites if 'N' not in site]
        pcm = make_pcm(sites)
        pfm = make_pfm(pcm)
        nsites = len(sites)
        background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}
        # strum_model here is the model learned in the last ('even') fold.
        write_strum(strum_model, output_auc + '/{}.pickle'.format(tag))
        write_meme(output_auc, tag, pfm, background, nsites)

    shutil.rmtree(tmp_dir)
    return 0
def learn_optimized_pwm(peaks_path, backgroud_path, counter, path_to_java,
                        path_to_chipmunk, tmp_r, output_auc, cpu_count, pfpr):
    """Scan motif lengths with ChIPMunk and evaluate each on an odd/even split.

    For every length in ``range(8, 31, 4)`` and for both folds (``'odd'``,
    ``'even'``) the training half of the peaks is written to
    ``tmp_r/train.fasta`` and ``run_chipmunk`` is called with that length as
    both of its length arguments.  The de-duplicated sites parsed from
    ``tmp_r/chipmunk_results.txt`` give a PWM (``sites_to_pwm``) that scores
    the test half and the background (read from ``backgroud_path`` when it is
    an existing file, otherwise obtained from ``creat_background``).  The
    sites, a PFM and a PWM rebuilt via ``make_pcm`` / ``make_pfm`` /
    ``make_pwm`` are saved under the tag ``pwm_model_<step>_<length>``.  Per
    length, pooled scores give the ROC tables and the value appended to
    ``output_auc/auc.txt``.

    ``tmp_r`` and ``output_auc`` are created when missing; an existing
    ``auc.txt`` is deleted first; ``tmp_r`` is removed at the end.

    NOTE(review): another function named ``learn_optimized_pwm`` appears in
    this source; if both live in one module the later definition wins.

    Returns:
        Always ``0``.
    """
    if not os.path.exists(tmp_r):
        os.mkdir(tmp_r)
    if not os.path.isdir(output_auc):
        os.mkdir(output_auc)
    auc_path = output_auc + '/auc.txt'
    if os.path.exists(auc_path):
        os.remove(auc_path)

    train_fasta = tmp_r + '/train.fasta'
    chipmunk_output = tmp_r + '/chipmunk_results.txt'

    # Candidate lengths 8, 12, ..., 28 (a former 12..40 range is disabled).
    for length in range(8, 31, 4):
        true_scores = []
        false_scores = []
        peaks = read_peaks(peaks_path)

        # The second tuple item is the remainder (index % 2) of training peaks.
        for step, train_rem in (('odd', 1), ('even', 0)):
            train_peaks = [p for idx, p in enumerate(peaks, 1) if idx % 2 == train_rem]
            test_peaks = [p for idx, p in enumerate(peaks, 1) if idx % 2 != train_rem]
            write_fasta(train_peaks, train_fasta)

            if os.path.isfile(backgroud_path):
                shuffled_peaks = read_peaks(backgroud_path)
            else:
                shuffled_peaks = creat_background(test_peaks, length, counter)

            run_chipmunk(path_to_java, path_to_chipmunk, train_fasta,
                         chipmunk_output, length, length, cpu_count)
            # Duplicate sites are dropped; the resulting order is that of a set.
            sites = list(set(parse_chipmunk(chipmunk_output)))

            scoring_pwm = sites_to_pwm(sites)
            true_scores.extend(true_scores_pwm(test_peaks, scoring_pwm, length))
            false_scores.extend(false_scores_pwm(shuffled_peaks, scoring_pwm, length))

            # The PWM written to disk is rebuilt from the PFM of the sites.
            pcm = make_pcm(sites)
            pfm = make_pfm(pcm)
            pwm = make_pwm(pfm)
            nsites = len(sites)
            background = {'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}

            tag = 'pwm_model_{0}_{1}'.format(step, length)
            write_meme(output_auc, tag, pfm, background, nsites)
            write_pwm(output_auc, tag, pwm)
            write_pfm(output_auc, tag, pfm)
            write_sites(output=output_auc, tag=tag, sites=sites)

        fprs = calculate_fprs(true_scores, false_scores)
        roc = calculate_short_roc(fprs, step=1)
        merged_roc = calculate_merged_roc(fprs)
        auc = calculate_particial_auc(merged_roc['TPR'], merged_roc['FPR'], pfpr)
        print("Length {};".format(length), "pAUC at {0} = {1};".format(pfpr, auc))

        write_auc(output_auc + '/auc.txt', auc, length)
        write_roc(output_auc + "/training_bootstrap_merged_{0}.txt".format(length),
                  merged_roc)
        write_roc(output_auc + "/training_bootstrap_{0}.txt".format(length), roc)

    shutil.rmtree(tmp_r)
    return 0