def annotate_all_bin_bed(wig_f, out_f, chrom):
    """Label every bin of a WIG file as Promoter, Enhancer or Weak.

    One tab-separated BED line is written per bin: ``chr<chrom>``, the bin
    start, the bin end (start + BIN_SIZE) and the label. A bin is a
    "Promoter" when its value reaches the 97th intensity percentile, an
    "Enhancer" when it reaches the 90th, and "Weak" otherwise.

    Args:
        wig_f: Path of the WIG file to read.
        out_f: Path of the BED file to write (overwritten).
        chrom: Chromosome name as a string, without the "chr" prefix.
    """
    #Context managers close both files even if parsing fails part-way.
    with open(wig_f, "r") as wig, open(out_f, "w") as out:
        #We choose the 97th and the 90th percentiles for promoter and enhancer, respectively.
        promoter_cutoff = wsu.get_intensity_percentile(0.97, wig, 0)
        enhancer_cutoff = wsu.get_intensity_percentile(0.90, wig, 0)

        #Skip the first two lines, which are the header.
        #NOTE(review): this assumes get_intensity_percentile leaves the handle
        #positioned at the start of the file - confirm against the helper.
        wig.readline()
        wig.readline()

        #Loop through the WIG file and annotate each region.
        for line in iter(wig.readline, ""):
            fields = line.split("\t")
            #A line without a tab (blank line or trailing junk) ends the data.
            if len(fields) <= 1:
                break

            start = fields[0]
            end = str(int(start) + BIN_SIZE)

            #Get the annotation.
            rpkm = float(fields[1])
            if rpkm >= promoter_cutoff:
                label = "Promoter"
            elif rpkm >= enhancer_cutoff:
                label = "Enhancer"
            else:
                label = "Weak"

            out.write("chr" + chrom + "\t" + start + "\t" + end + "\t" + label + "\n")
def save_significant(percentages, magnitudes, wig_name, out_name, chrom, cell, min_val):
    """Write the significant annotation (or "Unknown") for every magnitude.

    ``percentages`` is indexed as ``[annotation, magnitude]``; row ``i``
    corresponds to ``labels[i]`` below and column ``j`` to ``magnitudes[j]``.
    A row is significant for a column when it is not "Polycomb", its value
    exceeds 0.5, and no other row in that column exceeds half of it. Columns
    with no significant row are written as "Unknown".

    Each output line is tab-separated: label, ``<cell>_<chrom>_<magnitude>``,
    and the magnitude.

    Args:
        percentages: 2-D NumPy array of percentages.
        magnitudes: Sequence of magnitude names (strings), one per column.
        wig_name: Path of the WIG file.
        out_name: Path of the output file (overwritten).
        chrom: Chromosome name (string).
        cell: Cell-type name (string).
        min_val: Minimum intensity forwarded to the percentile helper.
    """
    #The percentile is not needed for the output below. The call is retained
    #in case the helper has side effects - TODO confirm and remove.
    with open(wig_name, 'r') as wig:
        wsu.get_intensity_percentile(0.995, wig, min_val)

    #Print significant magnitudes with labels.
    labels = ["Promoter", "Enhancer", "Polycomb", "Weak"]
    row_count = percentages.shape[0]
    row_ids = np.arange(row_count)

    with open(out_name, 'w') as out:
        for j in range(percentages.shape[1]):
            record_tail = ("\t" + cell + "_" + chrom + "_" + magnitudes[j]
                           + "\t" + magnitudes[j] + "\n")
            was_significant = False
            for i in range(row_count):
                if labels[i] == "Polycomb":
                    continue
                value = percentages[i, j]
                if not value > 0.5:
                    continue
                other_percents = percentages[row_ids != i, j]
                if np.max(other_percents) <= value / 2:
                    out.write(labels[i] + record_tail)
                    was_significant = True
            if not was_significant:
                out.write("Unknown" + record_tail)
def save_significant(percentages, shape_names, shapes, wig_name, out_name, chrom, cell, min_val, cutoff):
    """Append each shape, rescaled and labelled, to the output files.

    ``percentages`` is indexed as ``[annotation, shape]``; row ``i``
    corresponds to ``labels[i]`` below and column ``j`` to ``shapes[j]``.
    A row is significant for a column when it is not "Polycomb", its value
    exceeds ``cutoff`` and it is strictly greater than every other row in
    that column. Columns with no significant row are labelled "Unknown".

    Three files are written:
      * ``out_name`` (append): label, shape id, scaled shape values.
      * ``out_name + "_percents"`` (append): label, shape id, percentages.
      * ``out_name + "_" + cell + "/" + chrom + "percentages"`` (overwrite):
        one comma-separated line of percentages per shape.

    Args:
        percentages: 2-D NumPy array of percentages.
        shape_names: Sequence of shape names (strings), one per column.
        shapes: Sequence of comma-separated shape strings, one per column.
        wig_name: Path of the WIG file used to derive the scale factor.
        out_name: Base path of the output files.
        chrom: Chromosome name (string).
        cell: Cell-type name (string).
        min_val: Minimum intensity forwarded to the percentile helper.
        cutoff: Minimum percentage for a label to be considered significant.
    """
    #Get threshold to use in printing.
    with open(wig_name, 'r') as wig:
        intensity = wsu.get_intensity_percentile(0.995, wig, min_val)
    print("\n")
    scale = 5.0 / intensity
    perc_string = percentages.astype(str)

    labels = ["Promoter", "Enhancer", "Polycomb", "Weak"]
    row_count = percentages.shape[0]
    row_ids = np.arange(row_count)
    percentage_path = out_name + "_" + cell + "/" + chrom + "percentages"

    #All three outputs are closed on exit, including perc_out, which was
    #previously never closed.
    with open(out_name, 'a') as out, \
            open(out_name + "_percents", 'a') as perc_out, \
            open(percentage_path, 'w') as percentage_out:
        #Print significant shapes with labels.
        for j in range(percentages.shape[1]):
            scaled_clust = np.array([float(v) for v in shapes[j].split(",")]) * scale
            joined = ','.join([str(v) for v in scaled_clust])
            perc_joined = ','.join([str(v) for v in percentages[:, j]])

            #Both strings are built from str() of numbers, so they never
            #contain a newline; every record gets an explicit terminator.
            shape_id = cell + "_" + chrom + "_" + shape_names[j]
            shape_tail = "\t" + shape_id + "\t" + joined + "\n"
            perc_tail = "\t" + shape_id + "\t" + perc_joined + "\n"

            was_significant = False
            for i in range(row_count):
                if labels[i] == "Polycomb":
                    continue
                value = percentages[i, j]
                if not value > cutoff:
                    continue
                other_percents = percentages[row_ids != i, j]
                if np.max(other_percents) < value:
                    out.write(labels[i] + shape_tail)
                    perc_out.write(labels[i] + perc_tail)
                    was_significant = True
            if not was_significant:
                out.write("Unknown" + shape_tail)
                perc_out.write("Unknown" + perc_tail)

            #Print all percentages to a file to use later.
            percentage_out.write(','.join(perc_string[:, j]) + "\n")
def main():
    """Compute shape/annotation percentages and save the significant shapes.

    Command-line arguments (``sys.argv``):
        1: tab-delimited BED file with shape and biological annotations.
        2: file listing one shape per line.
        3: output path passed to ``save_significant``.
        4: WIG file path.
        5: chromosome name.
        6: cell-type name.
        7: annotation file passed to ``get_all_percentage_pairs``.
        8: minimum intensity value (float).
    """
    #Read in the bed file and shape data.
    bed = np.genfromtxt(sys.argv[1], delimiter='\t', dtype=str)
    output = sys.argv[3]
    wig = sys.argv[4]
    chromosome = sys.argv[5]
    cell = sys.argv[6]
    anno_file = sys.argv[7]
    min_val = float(sys.argv[8])

    #Shape annotations are in column 3. Biological annotations are in column 8.
    shape_col = 3
    bio_col = 8
    shape_start = 1
    shape_end = 2
    bio_start = 6
    bio_end = 7
    bio_len = 9

    #Get list of shapes. Lines keep their trailing newline.
    with open(sys.argv[2], 'r') as shape_file:
        shapes = shape_file.readlines()
    unique_clusts = sorted(set(bed[:, shape_col]), key=float)

    #Get percentage of each shape and each chromHMM annotation and each chromHMM annotation per shape.
    with open(wig, "r") as wig_file:
        threshold = wsu.get_intensity_percentile(0.75, wig_file, min_val)

    #The results of these two calls are not used below. The calls are retained
    #in case they have side effects - TODO confirm and remove.
    get_shape_percentages(shape_col, shape_start, shape_end, unique_clusts, bed)
    get_anno_percentages(bio_col, shape_start, shape_end, bio_len, bed)

    total_percent_all, _total_sums_all = get_all_percentage_pairs(
        shape_col, bio_col, shape_start, shape_end, bio_start, bio_end,
        bio_len, unique_clusts, bed, threshold, anno_file, shapes)

    #Print all shapes with significant annotations, along with their annotations.
    #NOTE(review): eight arguments are passed here; confirm this matches the
    #signature of the save_significant that is in scope for this script.
    save_significant(total_percent_all, unique_clusts, shapes, wig, output,
                     chromosome, cell, min_val)
def get_all_precision_or_recall(bed, sig, wig, chrom):
    """Compute per-annotation precision and recall for one chromosome.

    Args:
        bed: Forwarded to ``ppr.get_labels_and_ground_truth``.
        sig: Forwarded to ``ppr.get_labels_and_ground_truth``.
        wig: Path of the WIG file.
        chrom: Chromosome identifier, returned unchanged.

    Returns:
        ``[chrom, precision, recall]`` where ``precision`` and ``recall`` are
        dicts keyed by the index into ["Promoter", "Enhancer", "Weak"]. Both
        dicts are empty when there are no predictions.
    """
    #Get actual annotation and ground truth for all annotations and for all unannotated regions.
    #The handle is closed as soon as the threshold has been computed.
    with open(wig, 'r') as wig_file:
        threshold = wsu.get_intensity_percentile(0.75, wig_file, 0)
    annotations = ["Promoter", "Enhancer", "Weak"]

    #Get precision and recall for each type.
    pred, gt = ppr.get_labels_and_ground_truth(bed, sig, wig, annotations, threshold)
    precision = {}
    recall = {}
    if len(pred) > 0:
        for i in range(len(annotations)):
            precision[i] = precision_score(gt[:, i], pred[:, i])
            recall[i] = recall_score(gt[:, i], pred[:, i])
    return [chrom, precision, recall]
def main():
    """Compute the intensity threshold and shift all regions.

    Command-line arguments (``sys.argv``):
        1: input file path.
        2: output file path.
        3: bin size (int).
        4: region size (int).
        5: WIG file path.
        6: "fine" flag; see below.
        7: minimum intensity (float).
    """
    file_path = sys.argv[1]
    output_path = sys.argv[2]
    bin_size = int(sys.argv[3])
    region_size = int(sys.argv[4])
    percentile = 0.995

    #bool() of any non-empty string is True, so "False" or "0" used to enable
    #the flag. Recognise the usual false spellings explicitly; every other
    #value keeps its previous meaning (True).
    fine = sys.argv[6].strip().lower() not in ("", "0", "false", "f", "no", "n")
    minimum_intensity = float(sys.argv[7])

    with open(sys.argv[5], 'r') as wig_file:
        threshold = wsu.get_intensity_percentile(percentile, wig_file,
                                                 minimum_intensity, fine)
    print(str(threshold))
    shiftRegions(file_path, output_path, bin_size, region_size, threshold,
                 minimum_intensity)

    #Print message to user.
    print("Shifting complete for all windows.")
def get_all_precision_and_recall(bed, sig, wig, chrom):
    """Compute per-annotation precision, recall and false-positive rate.

    Args:
        bed: Forwarded to ``get_labels_and_ground_truth``.
        sig: Forwarded to ``get_labels_and_ground_truth``.
        wig: Path of the WIG file.
        chrom: Not used by this function.

    Returns:
        ``[precision, recall, pred.shape[0], threshold, pred, gt, fpr]``.
        ``precision``, ``recall`` and ``fpr`` are dicts keyed by the index
        into ["Promoter", "Enhancer", "Weak"].
    """
    #Get actual annotation and ground truth for all annotations and for all unannotated regions.
    #NOTE(review): BIN_SIZE is passed in the position where other callers pass
    #a minimum intensity - confirm this is intended.
    with open(wig, 'r') as wig_file:
        threshold = wsu.get_intensity_percentile(0.75, wig_file, BIN_SIZE)
    annotations = ["Promoter", "Enhancer", "Weak"]

    #Get precision and recall for each type.
    pred, gt = get_labels_and_ground_truth(bed, sig, wig, annotations, threshold)
    precision = {}
    recall = {}
    fpr = {}
    for i in range(len(annotations)):
        precision[i] = precision_score(gt[:, i], pred[:, i])
        recall[i] = recall_score(gt[:, i], pred[:, i])
        fpr[i] = _false_positive_rate(pred[:, i], gt[:, i])
    return [precision, recall, pred.shape[0], threshold, pred, gt, fpr]


def _false_positive_rate(pred_col, gt_col):
    """Return FP / (FP + TN) for one binary column, or 0.0 without negatives."""
    negatives = gt_col == 0
    fp = int(np.count_nonzero((pred_col == 1) & negatives))
    tn = int(np.count_nonzero((pred_col == 0) & negatives))
    total = fp + tn
    #With no ground-truth negatives the rate is undefined; report 0.0 rather
    #than raising ZeroDivisionError.
    if total == 0:
        return 0.0
    return float(fp) / total
def main():
    """Compute magnitude/annotation percentages and save the significant ones.

    Command-line arguments (``sys.argv``):
        1: tab-delimited BED file with magnitude and biological annotations.
        2: output path passed to ``save_significant``.
        3: WIG file path.
        4: chromosome name.
        5: cell-type name.
        6: annotation file passed to ``get_all_percentage_pairs``.
        7: minimum intensity value (float).
    """
    #Read in the bed file and shape data.
    bed = np.genfromtxt(sys.argv[1], delimiter='\t', dtype=str)
    output = sys.argv[2]
    wig = sys.argv[3]
    chromosome = sys.argv[4]
    cell = sys.argv[5]
    anno_file = sys.argv[6]
    min_val = float(sys.argv[7])

    #Magnitude annotations are in column 3. Biological annotations are in column 7.
    magnitude_col = 3
    bio_col = 7
    magnitude_start = 1
    magnitude_end = 2
    bio_start = 5
    bio_end = 6
    bio_len = 8

    #Get list of magnitudes, ordered numerically.
    unique_magnitudes = sorted(set(bed[:, magnitude_col]), key=float)
    print(unique_magnitudes)

    #Get percentage of each shape and each chromHMM annotation and each chromHMM annotation per shape.
    #The context manager closes the WIG file even if the helper raises.
    with open(wig, "r") as wig_file:
        threshold = wsu.get_intensity_percentile(0.75, wig_file, min_val)

    total_percent_all, _total_sums_all = get_all_percentage_pairs(
        magnitude_col, bio_col, magnitude_start, magnitude_end, bio_start,
        bio_end, bio_len, unique_magnitudes, bed, threshold, anno_file)

    #Print all shapes with significant annotations, along with their annotations.
    save_significant(total_percent_all, unique_magnitudes, wig, output,
                     chromosome, cell, min_val)
def get_all_precision_and_recall(bed, sig, tss_bed, tss_sig, or_bed, or_sig, and_bed, and_sig, rpkm_bed, rpkm_sig, wig, chrom, cell):
    """Score the base, TSS, OR, AND and RPKM annotation sets.

    Every ``*_bed`` / ``*_sig`` pair is forwarded to the label-extraction
    helpers together with ``wig`` and the intensity threshold.

    Args:
        bed, sig: Base annotation inputs.
        tss_bed, tss_sig: TSS-based promoter inputs.
        or_bed, or_sig: Combined (OR) annotation inputs.
        and_bed, and_sig: Combined (AND) annotation inputs.
        rpkm_bed, rpkm_sig: RPKM annotation inputs.
        wig: Path of the WIG file.
        chrom: Not used by this function.
        cell: Not used by this function.

    Returns:
        A list, in this order: precision, recall, precision_or, recall_or,
        precision_and, recall_and, precision_rpkm, recall_rpkm,
        pred.shape[0], threshold, pred, gt, pred_and, gt_and, fpr, fpr_or,
        fpr_and, fpr_rpkm. The score dicts are keyed by the index into
        ["Promoter", "Enhancer", "Weak"]; the base precision, recall and fpr
        dicts additionally carry a "tss" key.
    """
    #Get actual annotation and ground truth for all annotations and for all unannotated regions.
    with open(wig, 'r') as wig_file:
        threshold = wsu.get_intensity_percentile(0.75, wig_file, 0)
    annotations = ["Promoter", "Enhancer", "Weak"]
    length = len(annotations)

    #Get precision and recall for each type.
    pred, gt = get_labels_and_ground_truth(bed, sig, wig, annotations, threshold)
    precision, recall, fpr = _score_label_columns(pred, gt, length)

    #Get precision and recall for TSS-based promoters.
    pred_promoter, gt_promoter = get_tss_labels_and_ground_truth(
        tss_bed, tss_sig, wig, ["Promoter", "Not_Promoter"], threshold)
    precision["tss"] = precision_score(gt_promoter[:, 0], pred_promoter[:, 0])
    recall["tss"] = recall_score(gt_promoter[:, 0], pred_promoter[:, 0])
    fpr["tss"] = _false_positive_rate(pred_promoter[:, 0], gt_promoter[:, 0])

    #Get precision and recall for combined annotations (OR).
    pred_or, gt_or = get_labels_and_ground_truth(or_bed, or_sig, wig, annotations, threshold)
    precision_or, recall_or, fpr_or = _score_label_columns(pred_or, gt_or, length)

    #Get precision and recall for combined annotations (AND).
    pred_and, gt_and = get_labels_and_ground_truth(and_bed, and_sig, wig, annotations, threshold)
    precision_and, recall_and, fpr_and = _score_label_columns(pred_and, gt_and, length)

    #Get precision and recall for RPKM annotations.
    pred_rpkm, gt_rpkm = get_labels_and_ground_truth(rpkm_bed, rpkm_sig, wig, annotations, threshold)
    precision_rpkm, recall_rpkm, fpr_rpkm = _score_label_columns(pred_rpkm, gt_rpkm, length)

    return [
        precision, recall, precision_or, recall_or, precision_and, recall_and,
        precision_rpkm, recall_rpkm, pred.shape[0], threshold, pred, gt,
        pred_and, gt_and, fpr, fpr_or, fpr_and, fpr_rpkm
    ]


def _score_label_columns(pred, gt, count):
    """Return (precision, recall, fpr) dicts for the first ``count`` columns."""
    precision = {}
    recall = {}
    fpr = {}
    for i in range(count):
        precision[i] = precision_score(gt[:, i], pred[:, i])
        recall[i] = recall_score(gt[:, i], pred[:, i])
        fpr[i] = _false_positive_rate(pred[:, i], gt[:, i])
    return precision, recall, fpr


def _false_positive_rate(pred_col, gt_col):
    """Return FP / (FP + TN) for one binary column, or 0.0 without negatives."""
    negatives = gt_col == 0
    fp = int(np.count_nonzero((pred_col == 1) & negatives))
    tn = int(np.count_nonzero((pred_col == 0) & negatives))
    total = fp + tn
    #With no ground-truth negatives the rate is undefined; report 0.0 rather
    #than raising ZeroDivisionError.
    if total == 0:
        return 0.0
    return float(fp) / total
def match_shapes_cutoff(in_file_name, shape_file_name, out_dir, wig_name, cutoff):
    """Match every input region to its nearest shape and write BED output.

    Each line of the shape file is tab-separated: name, annotation, and a
    comma-separated list of values. Each line of the input file is
    comma-separated: three label fields followed by the signal values, which
    are multiplied by ``5.0 / intensity`` before matching.

    Two files are written: ``out_dir`` carries the matched annotation and
    ``out_dir + "clust"`` carries the matched shape name. A region is
    "Unknown" in both when its cross-correlation is below ``cutoff`` or no
    match was found (``match == -1``). Both files are created, and left
    empty, when the shape file holds fewer than two shapes.

    Args:
        in_file_name: Path of the comma-separated region file.
        shape_file_name: Path of the tab-separated shape file.
        out_dir: Path of the annotation output file.
        wig_name: Path of the WIG file used to derive the scale factor.
        cutoff: Minimum cross-correlation for a match to be accepted.
    """
    #Get threshold to use in printing.
    with open(wig_name, 'r') as wig:
        intensity = wsu.get_intensity_percentile(0.995, wig, 0)
    #Avoid dividing by zero when the percentile itself is zero.
    if intensity == 0:
        intensity = 0.1
    print("\n")
    scale = 5.0 / intensity

    #Read in shape data.
    shapes = []
    shape_anno = []
    shape_name = []
    with open(shape_file_name, "r") as shape_file:
        for shape_line in shape_file:
            split_tabs = shape_line.split("\t")
            shape_anno.append(split_tabs[1])
            shape_name.append(split_tabs[0])
            shapes.append([float(v) for v in split_tabs[2].split(",")])

    #Open input and output files; all are closed on exit, even on error.
    with open(in_file_name, "r") as in_file, \
            open(out_dir, "w") as out_file, \
            open(out_dir + "clust", "w") as out_clust:
        if len(shapes) <= 1:
            print("Only one shape. No annotation performed.")
            return

        #Read in each line in the file and map it.
        for region_line in in_file:
            #Get the chromosome and position info to print to the output file.
            #Get the input signal to match with the shapes.
            split_line = region_line.split(",")
            labels = split_line[0:3]
            signal_scaled = np.array([float(v) for v in split_line[3:]]) * scale

            #Match the data to the nearest shape and obtain the match and the ambiguity metric.
            match, ambig, crosscorr, out_str = match_region(signal_scaled, shapes, shape_anno)

            #If the cross-correlation is within the threshold, assign the label as the correct label for the region.
            #Otherwise, assign it as unknown.
            anno_label = "Unknown"
            anno_name = "Unknown"
            if crosscorr >= cutoff and match != -1:
                anno_label = shape_anno[match]
                anno_name = shape_name[match]

            #Print match to BED file. Format is:
            #chrom start end shape_num 1 - ambiguity
            #Score is the opposite of the ambiguity metric.
            position = "chr" + labels[0] + "\t" + labels[1] + "\t" + labels[2] + "\t"
            score = "\t" + str(1 - ambig) + "\t" + out_str + "\n"
            out_file.write(position + anno_label + score)
            out_clust.write(position + anno_name + score)

        #Print a message to the user.
        print("Files done")