def get_all_percentage_pairs(anno, chrom_hmm_anno, start, end, chrom_hmm_start, chrom_hmm_end, chrom_hmm_len, shapes, bed, thresh, signals_path, shape_str):
    #Set up percentage matrix.
    sum_matrix = np.zeros((2, len(shapes)))
    cumulative_vec = np.zeros(len(shapes))

    #Loop through bed file to compute percentage for each region.
    current_start = -1
    current_end = -1
    current_clust = "none"
    prev_start = -1
    prev_end = -1
    sigs = open(signals_path, "r")

    for j in range(0, 2): #Loop through the BED file twice.
        next_signal = sigs.readline().split(",")
        if j == 1:
            next_signal = sigs.readline().split(",")
        for i in range(0, bed.shape[0]):
            #Get the previous data, if applicable.
            if i > 0:
                prev_start = int(bed[i - 1, start])
                prev_end = int(bed[i - 1, end])
                prev_clust = bed[i - 1, anno]

            #Get the next element data.
            next_line = bed[i, :]
            current_start = int(next_line[start])
            current_end = int(next_line[end])
            current_clust = next_line[anno]
            a = next_line[chrom_hmm_anno]
            idx = shapes.index(current_clust)
            clust_sig = [float(s) for s in shape_str[idx].split(",")]

            #Advance the signal data until it lines up with the current region.
            if (prev_start >= int(next_signal[1]) or current_start > int(next_signal[1])) and current_start > prev_start:
                next_signal = sigs.readline().split(",")
                if current_start > int(next_signal[1]):
                    next_signal = sigs.readline().split(",")

            if int(next_signal[1]) == current_start:
                #Add to the existing percentages.
                region = [float(s) for s in next_signal[3:]]
                count_clust = wsu.count_above(thresh, "", clust_sig, 0, len(clust_sig) * BIN_SIZE, 0, 0, BIN_SIZE)
                count_a = wsu.count_above(thresh, a, region, current_start, current_end, int(next_line[chrom_hmm_start]), int(next_line[chrom_hmm_end]), BIN_SIZE)
                if a == "AE" or a == "OE":
                    sum_matrix[0, idx] += int(next_line[chrom_hmm_len]) if (count_clust == 0) else count_a
                elif a != "0" and a != "AP" and a != "OP" and a != "GE" and a != "TS":
                    sum_matrix[1, idx] += int(next_line[chrom_hmm_len]) if (count_clust == 0) else count_a

            #Update the cumulative total for this shape.
            cumulative_vec[idx] = np.sum(sum_matrix[:, idx])
    sigs.close()

    #Get the set of percentages.
    cumulative_matrix = np.tile(cumulative_vec, (2, 1))
    return [sum_matrix / cumulative_matrix, np.sum(sum_matrix, 0)]
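#Illustrative sketch of how the shape-based get_all_percentage_pairs might be
#called. The column indices, threshold, and file names below are assumed
#placeholders for demonstration only, not values prescribed by this script
#(numpy is assumed to be imported as np elsewhere in the module):
#    bed = np.genfromtxt("shape_chromhmm_intersect.bed", delimiter='\t', dtype=str)
#    percentages, totals = get_all_percentage_pairs(
#        anno=3, chrom_hmm_anno=8, start=1, end=2, chrom_hmm_start=6,
#        chrom_hmm_end=7, chrom_hmm_len=9, shapes=shapes, bed=bed,
#        thresh=4.0, signals_path="signals.csv", shape_str=shape_str)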
def get_labels_and_ground_truth(bed_file, sig_file, wig, annotations, threshold):
    #Set up the prediction and ground truth vectors.
    vec_pred = list()
    vec_gt = list()
    final_stack_pred = np.empty((0, 0))
    final_stack_gt = np.empty((0, 0))

    #Get scores and labels for each bed file.
    bed = np.genfromtxt(bed_file, delimiter='\t', dtype=str)
    sigf = open(sig_file, "r")

    #Loop through bed file to compute percentage for each region.
    current_start = -1
    current_end = -1
    prev_start = -1
    prev_end = -1
    sig_i = 0

    #Do not move forward if the first line is blank in the sig file.
    try:
        sig = [float(s) for s in sigf.readline().split(",")]
        sum_vec = np.zeros(3)

        #Keep track of regions with no ChromHMM annotations.
        #These regions will not be used in the analysis.
        not_annotated_count = 0
        count_in_region = 0
        for i in range(0, bed.shape[0]):
            #Get the next element data.
            next_line = bed[i, :]
            current_start = int(next_line[1])
            current_end = int(next_line[2])
            a = next_line[8]
            anno_start = int(next_line[6])
            anno_end = int(next_line[7])
            our_anno = next_line[3]
            anno_length = int(next_line[9])

            #Get the next signal if needed.
            #If we are still on the same region, don't get it.
            if current_start != prev_start:
                if sig_i != 0:
                    sig_s = sigf.readline()
                    sig = [float(s) for s in sig_s.split(",")]
                    sum_vec = np.zeros(3)
                sig_i += 1

            #Add to the existing percentages.
            #If the region has peaks, consider only regions above the peak threshold.
            #If no peaks exist, consider the entire region.
            total_peak_size = wsu.count_above(threshold, "", sig, current_start, current_end, current_start, current_end, BIN_SIZE)
            if a == "1_TssA" or a == "2_TssAFlnk" or a == "10_TssBiv" or a == "11_BivFlnk":
                if total_peak_size > 0:
                    sum_vec[0] += wsu.count_above(threshold, a, sig, current_start, current_end, anno_start, anno_end, BIN_SIZE)
                else:
                    sum_vec[0] += anno_length
            elif a == "6_EnhG" or a == "7_Enh" or a == "12_EnhBiv":
                if total_peak_size > 0:
                    sum_vec[1] += wsu.count_above(threshold, a, sig, current_start, current_end, anno_start, anno_end, BIN_SIZE)
                else:
                    sum_vec[1] += anno_length
            elif a == "9_Het" or a == "15_Quies":
                if total_peak_size > 0:
                    sum_vec[2] += wsu.count_above(threshold, a, sig, current_start, current_end, anno_start, anno_end, BIN_SIZE)
                else:
                    sum_vec[2] += anno_length
            #This case is when there is no annotation. Do not count it.
            else:
                not_annotated_count += 1
            count_in_region += 1

            #Add the ground truth and predicted value based on the annotation with the maximum count above the threshold.
            next_start = current_start + 1
            if i + 1 < len(bed):
                next_start = int(bed[i + 1, :][1])
            if next_start != current_start and not_annotated_count != count_in_region:
                #Add another element to the ground truth and prediction vectors.
                vec_gt.append(np.zeros(len(sum_vec)))
                vec_pred.append(np.zeros(len(sum_vec)))
                max_idx = np.argmax(sum_vec)
                for k in range(0, len(sum_vec)):
                    #Add ground truth.
                    if k == max_idx:
                        vec_gt[len(vec_gt) - 1][k] = 1
                    else:
                        vec_gt[len(vec_gt) - 1][k] = 0
                    #Add predictions.
                    if annotations[k] == our_anno:
                        vec_pred[len(vec_pred) - 1][k] = 1
                    else:
                        vec_pred[len(vec_pred) - 1][k] = 0

                #If the region is unknown according to our analysis, do not consider it.
                #This includes cases where there is no annotation from ChromHMM or where
                #there is signal above the threshold but no ChromHMM annotation in the signal.
                if vec_pred[len(vec_pred) - 1][0] == 0 and vec_pred[len(vec_pred) - 1][1] == 0 and vec_pred[len(vec_pred) - 1][2] == 0:
                    del vec_pred[len(vec_pred) - 1]
                    del vec_gt[len(vec_gt) - 1]
                elif sum_vec[0] == 0 and sum_vec[1] == 0 and sum_vec[2] == 0:
                    del vec_pred[len(vec_pred) - 1]
                    del vec_gt[len(vec_gt) - 1]

                #Reset the counts for the next region.
                not_annotated_count = 0
                count_in_region = 0

            #Record the previous data for the next iteration.
            prev_start = current_start
            prev_end = current_end

        #Stack all values.
        final_stack_pred = np.stack(vec_pred)
        final_stack_gt = np.stack(vec_gt)
    except Exception:
        pass

    #Return value.
    return [final_stack_pred, final_stack_gt]
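#Illustrative sketch of how get_labels_and_ground_truth might be called and what
#it returns. The file names, annotation labels, and threshold are hypothetical
#placeholders; the wig argument is not used inside the function:
#    annotations = ["Promoter", "Enhancer", "Weak"]
#    pred, gt = get_labels_and_ground_truth("chr1_intersect.bed",
#        "chr1_signals.csv", None, annotations, 4.0)
#    #pred and gt are (n_regions, 3) one-hot matrices aligned row for row.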
def get_tss_labels_and_ground_truth(bed_file, sig_file, wig, annotations, threshold):
    #Set up counts of promoter and non-promoter.
    vec_pred = list()
    vec_gt = list()
    bed = np.genfromtxt(bed_file, delimiter='\t', dtype=str)
    sigs = np.genfromtxt(sig_file, delimiter=',', dtype=float)

    #Loop through bed file to compute percentage for each region.
    current_start = -1
    current_end = -1
    prev_start = -1
    prev_end = -1
    sig = sigs[0, :]
    sig_i = -1
    sum_vec = np.zeros(2)

    #Keep track of regions with no ChromHMM annotations.
    #These regions will not be used in the analysis.
    not_annotated_count = 0
    count_in_region = 0
    for i in range(0, bed.shape[0]):
        #Get the next element data.
        next_line = bed[i, :]
        current_start = int(next_line[1])
        current_end = int(next_line[2])
        a = next_line[7]
        anno_start = int(next_line[5])
        anno_end = int(next_line[6])
        our_anno = next_line[3]
        anno_length = int(next_line[8])

        #Get the next signal if needed.
        #If we are still on the same region, don't get it.
        if current_start != prev_start:
            sig_i += 1
            sig = sigs[sig_i, :]
            sum_vec = np.zeros(2)

        #Add to the existing percentages.
        #If the region has peaks, consider only regions above the peak threshold.
        #If no peaks exist, consider the entire region.
        total_peak_size = wsu.count_above(threshold, "", sig, current_start, current_end, current_start, current_end, BIN_SIZE)
        if a == "1_TssA" or a == "2_TssAFlnk" or a == "10_TssBiv" or a == "11_BivFlnk":
            if total_peak_size > 0:
                sum_vec[0] += wsu.count_above(threshold, a, sig, current_start, current_end, anno_start, anno_end, BIN_SIZE)
            else:
                sum_vec[0] += anno_length
        elif anno_length != 0:
            if total_peak_size > 0:
                sum_vec[1] += wsu.count_above(threshold, a, sig, current_start, current_end, anno_start, anno_end, BIN_SIZE)
            else:
                sum_vec[1] += anno_length
        #This case is when there is no annotation. Do not count it.
        else:
            not_annotated_count += 1
        count_in_region += 1

        #Add the ground truth and predicted value based on the annotation with the maximum count above the threshold.
        next_start = current_start + 1
        if i + 1 < len(bed):
            next_start = int(bed[i + 1, :][1])
        if next_start != current_start and not_annotated_count != count_in_region:
            vec_gt.append(np.zeros(len(sum_vec)))
            vec_pred.append(np.zeros(len(sum_vec)))
            max_idx = np.argmax(sum_vec)
            for k in range(0, len(sum_vec)):
                #Add ground truth.
                if k == max_idx:
                    vec_gt[len(vec_gt) - 1][k] = 1
                else:
                    vec_gt[len(vec_gt) - 1][k] = 0
                #Add predictions.
                if annotations[k] == our_anno:
                    vec_pred[len(vec_pred) - 1][k] = 1
                else:
                    vec_pred[len(vec_pred) - 1][k] = 0

            #If nothing was counted for this region, do not consider it.
            if sum_vec[0] == 0 and sum_vec[1] == 0:
                del vec_pred[len(vec_pred) - 1]
                del vec_gt[len(vec_gt) - 1]

            #Reset the counts for the next region.
            not_annotated_count = 0
            count_in_region = 0

        #Record the previous data for the next iteration.
        prev_start = current_start
        prev_end = current_end

    #Return value.
    return [np.stack(vec_pred), np.stack(vec_gt)]
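#Illustrative sketch of a call to get_tss_labels_and_ground_truth. The file
#names, labels, and threshold are hypothetical placeholders; the wig argument
#is not used inside the function:
#    annotations = ["Promoter", "Other"]
#    pred, gt = get_tss_labels_and_ground_truth("tss_intersect.bed",
#        "tss_signals.csv", None, annotations, 4.0)
#    #pred and gt are (n_regions, 2) one-hot matrices (promoter vs. non-promoter).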
def get_all_percentage_pairs(anno, chrom_hmm_anno, start, end, chrom_hmm_start, chrom_hmm_end, chrom_hmm_len, magnitudes, bed, thresh, signals_path):
    #Set up percentage matrix.
    sum_matrix = np.zeros((4, len(magnitudes)))
    cumulative_vec = np.zeros(len(magnitudes))

    #Loop through bed file to compute percentage for each region.
    current_start = -1
    current_end = -1
    current_clust = "none"
    prev_start = -1
    prev_end = -1
    sigs = open(signals_path, "r")

    for j in range(0, 2): #Loop through the BED file twice.
        next_signal = sigs.readline().split(",")
        if j == 1:
            next_signal = sigs.readline().split(",")
        for i in range(0, bed.shape[0]):
            if len(next_signal) > 1:
                #Get the previous data, if applicable.
                if i > 0:
                    prev_start = int(bed[i - 1, start])
                    prev_end = int(bed[i - 1, end])

                #Get the next element data.
                next_line = bed[i, :]
                current_start = int(next_line[start])
                current_end = int(next_line[end])
                current_clust = next_line[anno]
                a = next_line[chrom_hmm_anno]
                idx = magnitudes.index(current_clust)

                #Advance the signal data until it lines up with the current region.
                if (prev_start >= int(next_signal[1]) or current_start > int(next_signal[1])) and current_start > prev_start:
                    next_signal = sigs.readline().split(",")
                    if len(next_signal) > 1:
                        if current_start > int(next_signal[1]):
                            next_signal = sigs.readline().split(",")

                if len(next_signal) > 1:
                    if int(next_signal[1]) == current_start:
                        #Add to the existing percentages.
                        region = [float(s) for s in next_signal[3:]]
                        count_a = wsu.count_above(thresh, a, region, current_start, current_end, int(next_line[chrom_hmm_start]), int(next_line[chrom_hmm_end]), BIN_SIZE)
                        if a == "1_TssA" or a == "2_TssAFlnk" or a == "10_TssBiv" or a == "11_BivFlnk":
                            sum_matrix[0, idx] += int(next_line[chrom_hmm_len]) if (int(current_clust) < thresh) else count_a
                        elif a == "6_EnhG" or a == "7_Enh" or a == "12_EnhBiv":
                            sum_matrix[1, idx] += int(next_line[chrom_hmm_len]) if (int(current_clust) < thresh) else count_a
                        elif a == "13_ReprPC" or a == "14_ReprPCWk":
                            sum_matrix[2, idx] += int(next_line[chrom_hmm_len]) if (int(current_clust) < thresh) else count_a
                        elif a == "9_Het" or a == "15_Quies":
                            sum_matrix[3, idx] += int(next_line[chrom_hmm_len]) if (int(current_clust) < thresh) else count_a

                #Update the cumulative total for this magnitude.
                cumulative_vec[idx] = np.sum(sum_matrix[:, idx])
    sigs.close()

    #Get the set of percentages.
    cumulative_matrix = np.tile(cumulative_vec, (4, 1))
    return [sum_matrix / cumulative_matrix, np.sum(sum_matrix, 0)]
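#Illustrative sketch of a call to the magnitude-based get_all_percentage_pairs.
#The column indices, magnitude list, threshold, and file names are assumed
#placeholders for demonstration only:
#    magnitudes = ["0", "1", "2", "3"]
#    bed = np.genfromtxt("magnitude_chromhmm_intersect.bed", delimiter='\t', dtype=str)
#    percentages, totals = get_all_percentage_pairs(
#        anno=3, chrom_hmm_anno=8, start=1, end=2, chrom_hmm_start=6,
#        chrom_hmm_end=7, chrom_hmm_len=9, magnitudes=magnitudes, bed=bed,
#        thresh=4.0, signals_path="signals.csv")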