def parse_repseek_output(infile, intervals, min_spacer=4500,
                         max_spacer=600000, max_offset=3000):
    '''
    Collect, for each candidate interval, the distant repeats that flank it.

    Each non-empty line of infile is tab-separated; the first six columns are
    read as: repeat type, position of copy 1, position of copy 2, length of
    copy 1, length of copy 2, spacer size. Any further columns are ignored.

    A repeat is kept when its spacer lies in [min_spacer, max_spacer]. It is
    attached to an interval returned by find() when the interval start is
    within max_offset of the first copy position and the interval end is
    within max_offset of the end of the second copy.

    infile     -- path of the repeat table
    intervals  -- intervals passed to get_window_tree(); the first two items
                  of each interval returned by find() are used as start/end
    min_spacer, max_spacer, max_offset -- thresholds (defaults keep the
                  previously hard-coded values)

    Returns a dict: interval -> list of (type, pos1, end, len1, len2).
    '''
    tree = get_window_tree(intervals)
    repeat_dict = {}
    with open(infile, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line:
                # Blank (e.g. trailing) lines carry no record
                continue
            fields = line.split('\t')
            repeat_type = fields[0]
            pos1 = int(fields[1])
            pos2 = int(fields[2])
            len1 = int(fields[3])
            len2 = int(fields[4])
            spacer = int(fields[5])

            # Only distant repeats are candidates for flanking an interval
            if not min_spacer <= spacer <= max_spacer:
                continue

            end = pos2 + len2
            for intv in find(pos1, end, tree):
                istart, iend = intv[0], intv[1]
                if abs(istart - pos1) < max_offset and abs(iend - end) < max_offset:
                    repeat_dict.setdefault(intv, []).append(
                        (repeat_type, pos1, end, len1, len2))
    return repeat_dict
def parse_trnas(infile, intervals, offset):
    '''
    Collect, for each interval, the tRNAs located in or near it.

    Each non-empty line of infile is tab-separated, with the tRNA start and
    end positions in the 3rd and 4th columns (either order). The tRNA span is
    widened by offset on both sides before querying, so that a tRNA close to
    an interval is also reported.

    Returns a dict: interval (as returned by find()) -> list of (pos1, pos2),
    where the positions keep the order in which they appear in the file.
    '''
    tree = get_window_tree(intervals)
    trna_dict = {}
    with open(infile, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line:
                # Blank (e.g. trailing) lines carry no record
                continue
            fields = line.split('\t')
            pos1 = int(fields[2])
            pos2 = int(fields[3])
            # Coordinates may be listed in reverse order; normalise for the query
            tstart = min(pos1, pos2)
            tend = max(pos1, pos2)
            # Extend search region in case a tRNA is nearby
            for intv in find(tstart - offset, tend + offset, tree):
                trna_dict.setdefault(intv, []).append((pos1, pos2))
    return trna_dict
def parse_trnas_contigs(infile, id_mapping, intervals, offset):
    '''
    Collect, for each interval on each contig, the tRNAs located in or near it.

    Each non-empty line of infile is tab-separated: the first column is the
    contig name (translated to a contig id through id_mapping), the 3rd and
    4th columns are the tRNA start and end positions (either order).

    infile     -- path of the tRNA table
    id_mapping -- dict: contig name -> contig id
    intervals  -- dict: contig id -> intervals passed to get_window_tree()
    offset     -- number of bases by which the tRNA span is widened on both
                  sides before querying

    Returns a dict keyed by ('<id>_<start>', '<id>_<end>') whose values are
    lists of (pos1, pos2) in file order.
    '''
    trna_dict = {}
    # One tree per contig, built on first use instead of once per input line.
    # NOTE(review): assumes find() does not modify the tree -- confirm.
    tree_cache = {}
    with open(infile, 'r') as fin:
        for line in fin:
            line = line.strip()
            if not line:
                # Blank (e.g. trailing) lines carry no record
                continue
            fields = line.split('\t')
            name = fields[0].strip()
            contig_id = id_mapping[name]
            pos1 = int(fields[2])
            pos2 = int(fields[3])
            tstart = min(pos1, pos2)
            tend = max(pos1, pos2)

            if contig_id not in tree_cache:
                tree_cache[contig_id] = get_window_tree(intervals[contig_id])
            tree = tree_cache[contig_id]

            # Extend search region in case a tRNA is nearby
            for intv in find(tstart - offset, tend + offset, tree):
                key = (str(contig_id) + '_' + str(intv[0]),
                       str(contig_id) + '_' + str(intv[1]))
                # Accumulate under the same key that is stored; testing the raw
                # interval instead made every tRNA overwrite the previous ones.
                trna_dict.setdefault(key, []).append((pos1, pos2))
    return trna_dict
def extend_boundary_contig(intervals, genes_dict):
    '''
    Snap interval boundaries to gene boundaries, contig by contig.

    intervals  -- iterable of (p1, p2) strings formatted as '<contig id>_<coord>'
    genes_dict -- dict: contig id -> genes passed to get_window_tree()

    A boundary is moved outwards to the edge of the outermost overlapping gene
    when the part of that gene hanging outside the interval is smaller than
    half of the gene; otherwise the boundary is left where it is.

    Returns a dict: contig id (int) -> list of (start, end).
    '''
    # One gene tree per contig to facilitate querying
    tree_dict = {}
    for cid, genes in genes_dict.items():
        tree_dict[cid] = get_window_tree(genes)

    # Grouped by contig to facilitate the merging of overlapping regions
    new_intervals = {}
    for p1, p2 in intervals:
        # The separator position of p1 is reused for p2 (same contig prefix)
        mark = p1.index('_')
        start = int(p1[mark + 1:])
        end = int(p2[mark + 1:])
        contig_id = int(p1[0:mark])

        overlap = find(start, end, tree_dict[contig_id])
        if len(overlap) == 0:
            coord = (start, end)
        else:
            print('intervals with overlapping:')
            print(p1)
            print(p2)
            print(overlap)
            ordered = sorted(overlap, key=lambda g: (int(g[0]), int(g[1])))
            first_gene = ordered[0]
            last_gene = ordered[-1]
            ostart = first_gene[0]
            oend = last_gene[1]
            left_gene_size = first_gene[1] - first_gene[0] + 1
            left_hang = start - ostart + 1
            right_gene_size = last_gene[1] - last_gene[0] + 1
            right_hang = oend - end + 1

            # Extend only when less than half of the boundary gene lies outside
            extend_left = ostart < start and left_hang < left_gene_size / 2
            extend_right = oend > end and right_hang < right_gene_size / 2
            nstart = ostart if extend_left else start
            nend = oend if extend_right else end
            coord = (nstart, nend)

        new_intervals.setdefault(contig_id, []).append(coord)
    return new_intervals
def getCommonIntervals(interval1, interval2, fraction):
    '''
    Find the regions shared by two lists of intervals.

    interval1 -- list of (start, end) reference intervals
    interval2 -- intervals passed to get_window_tree(); the first two items of
                 each interval returned by find() are used as start/end
    fraction  -- an interval pair is reported only when the shared region is
                 larger than fraction * size of the interval2 member

    Returns [] when either list is empty (kept for existing callers).
    Otherwise returns three sets:
      - (o_start, o_end, o_size) for every shared region,
      - the (start, end) members of interval1 that pass the fraction test,
      - the (start, end) members of interval2 that pass the fraction test.
    '''
    if not interval1 or not interval2:
        return []

    tree = get_window_tree(interval2)
    common_regions = set()
    matched_interval1 = set()
    matched_interval2 = set()

    for start, end in interval1:
        # All interval2 members overlapping this interval1 member
        for interval in find(start, end, tree):
            o_start = max(interval[0], start)
            o_end = min(interval[1], end)
            o_size = o_end - o_start + 1
            common_regions.add((o_start, o_end, o_size))

            # The threshold is relative to the interval2 member, not to (start, end)
            query_size = interval[1] - interval[0] + 1
            if o_size > fraction * query_size:
                # Sizes are not stored to facilitate set intersection
                matched_interval1.add((start, end))
                matched_interval2.add((interval[0], interval[1]))

    return common_regions, matched_interval1, matched_interval2
def extend_boundary(intervals, genes):
    '''
    Snap interval boundaries to gene boundaries.

    intervals -- iterable of (start, end)
    genes     -- genes passed to get_window_tree()

    A boundary is moved outwards to the edge of the outermost overlapping gene
    when the part of that gene hanging outside the interval is smaller than
    half of the gene; otherwise the boundary is left where it is.

    Returns a list of (start, end), one per input interval, in input order.
    '''
    # Interval tree over the genes to facilitate querying
    gene_tree = get_window_tree(genes)
    extended = []
    for start, end in intervals:
        hits = find(start, end, gene_tree)
        if len(hits) == 0:
            extended.append((start, end))
            continue

        # NOTE(review): the gene with the largest start is taken as the right
        # boundary gene; with nested genes it may not have the largest end.
        ordered = sorted(hits, key=lambda g: (int(g[0]), int(g[1])))
        first_gene = ordered[0]
        last_gene = ordered[-1]
        ostart = first_gene[0]
        oend = last_gene[1]
        left_gene_size = first_gene[1] - first_gene[0] + 1
        left_hang = start - ostart + 1
        right_gene_size = last_gene[1] - last_gene[0] + 1
        right_hang = oend - end + 1

        extend_left = ostart < start and left_hang < left_gene_size / 2
        extend_right = oend > end and right_hang < right_gene_size / 2
        nstart = ostart if extend_left else start
        nend = oend if extend_right else end
        extended.append((nstart, nend))
    return extended
def getMetric_base(ref_intervals, query_intervals, genelist, options):
    '''
    For evaluation of the predictions of intervals based on nucleotides or genes.

    A given percentage is used to determine whether a reference interval
    (genomic island or gene) is detected. If only a base of an interval is
    detected, it is meaningless to classify this interval as predicted.

    ref_intervals   -- list of (start, end) reference intervals
    query_intervals -- list of predicted intervals
    genelist        -- genes handed to getGenesInInterval(); only used when
                       options.pttfile or options.gene_list is set
    options         -- object providing output, overlap, pttfile, gene_list,
                       cutoff_base and (for gene metrics) cutoff_gene

    A per-reference table and summary statistics are printed to stdout.
    When options.output is set, boundary offsets are written to
    options.output + '_offset' and the predictions not matching the reference
    to options.output; when options.overlap is set, the matching predictions
    are written there.

    Returns (recall, precision, fmeasure, options.output), where the three
    measures are the base-level ones.
    '''
    foffset = None
    if options.output:
        foffset = open(options.output + '_offset', 'w')
    try:
        return _metric_base_report(ref_intervals, query_intervals, genelist,
                                   options, foffset)
    finally:
        # Close the offset report even when the evaluation fails half-way
        if foffset is not None:
            foffset.close()


def _metric_base_report(ref_intervals, query_intervals, genelist, options,
                        foffset):
    '''Print and compute the metrics for getMetric_base; foffset is an open file or None.'''

    def ratio(numerator, denominator):
        # An undefined ratio (nothing predicted, empty reference, ...) is
        # reported as 0, like the F-measure and percentage guards below.
        if denominator == 0:
            return 0
        return numerator / denominator

    def by_position(x):
        return (int(x[0]), int(x[1]))

    use_genes = bool(options.pttfile or options.gene_list)

    print('(start, end, size)\tleft_offset\tright_offset\toverlap_regions\tpredicted_size\toverlap_percentage\toverlap_percentage_pred\tnum_reference_genes\tnum_predicted_genes\tnum_overlap_genes\toverlap_gene_percentage\toverlap_gene_percentage_pred\tgaps')

    tree = get_window_tree(query_intervals)
    avg_query_len = ratio(get_interval_length(query_intervals),
                          len(query_intervals))

    # The list of query intervals overlapping with the reference.
    # A list is used so that an interval overlapping several reference
    # intervals is counted once per reference; it is converted to a set later.
    overlap_intervals = []
    # The number of reference intervals with no overlapping query intervals
    num_nooverlap = 0

    # For evaluation based on bases
    ref_totalSize = 0
    overlap_totalSize = 0
    # To record the boundary offset for each reference interval
    extentions = []

    # For evaluation based on genes
    overlap_total_genes = set()
    ref_total_genes = set()

    tp_interval = 0
    for start, end in ref_intervals:
        # Find all query intervals overlapping with the reference
        # (even 1 bp overlap is counted here)
        overlap = find(start, end, tree)
        if len(overlap) == 0:
            num_nooverlap += 1
        overlap_intervals.extend(overlap)

        ############################ boundary offset ############################
        offset_line_str = ''
        offset_line = []
        # The predicted interval may be much larger than the reference interval.
        # There may be several predicted intervals; only the outermost ones are
        # checked for boundary accuracy.
        sorted_overlap = sorted(overlap, key=by_position)
        left_ext = 0
        right_ext = 0
        if len(sorted_overlap) > 0:
            # A negative number represents a predicted interval crossing the
            # reference boundary
            left_ext = sorted_overlap[0][0] - start
            right_ext = end - sorted_overlap[-1][1]
            extentions.append((abs(left_ext), abs(right_ext)))
            offset_line_str += '%s\t%s\t%s\t%s'
            offset_line.extend([start, end, left_ext, right_ext])

        ############################## gene metrics #############################
        if use_genes:
            ref_genes = getGenesInInterval((start, end), genelist,
                                           options.cutoff_gene)
            num_refgenes = len(ref_genes)
            ref_total_genes.update(ref_genes)
            # Reference genes recovered by the predictions for this interval
            overlap_genes = set()
            # Genes in the predicted intervals, to get an idea of
            # over/under estimation
            predicted_genes = set()
            for interval in overlap:
                genes = getGenesInInterval(interval, genelist,
                                           options.cutoff_gene)
                # Output genes at the boundaries of each predicted interval
                if len(genes) > 0:
                    sorted_genes = sorted(genes, key=by_position)
                    offset_line.extend([sorted_genes[0], sorted_genes[-1]])
                    offset_line_str += '\t%s\t%s'
                predicted_genes.update(genes)
                overlap_genes.update(set(ref_genes).intersection(genes))
            overlap_total_genes.update(overlap_genes)

            num_overlapgenes = len(overlap_genes)
            # num_predictedgenes >= num_overlapgenes, since genes in a predicted
            # interval may lie outside the reference interval
            num_predictedgenes = len(predicted_genes)
            if num_refgenes > 0:
                overlap_gene_percentage = round(
                    num_overlapgenes * 100 / num_refgenes, 3)
            else:
                overlap_gene_percentage = 0
            if num_predictedgenes > 0:
                overlap_gene_percentage_pred = round(
                    num_overlapgenes * 100 / num_predictedgenes, 3)
            else:
                overlap_gene_percentage_pred = 0

        ############################## base metrics #############################
        overlap_bases = 0
        # Intersections of the reference with each overlapping query interval
        overlap_interval_list = []
        for interval in overlap:
            overlap_interval = get_overlap_interval(interval, (start, end))
            if overlap_interval is not None:
                overlap_interval_list.append(overlap_interval)

        if foffset is not None:
            offset_line_str += '\n'
            foffset.write(offset_line_str % tuple(offset_line))

        if len(overlap_interval_list) > 0:
            overlap_bases = getOverlapIntervalSize(overlap_interval_list)
        refsize = int(end) - int(start) + 1
        ref_totalSize += refsize

        # Only count the overlap if the reference interval counts as found
        if overlap_bases > options.cutoff_base * refsize:
            overlap_totalSize += overlap_bases
            tp_interval += 1
        else:
            # Not enough overlap: drop this reference's occurrences again
            for interval in overlap:
                overlap_intervals.remove(interval)

        # The fraction of the reference interval covered by the query intervals
        overlap_percentage = round(overlap_bases * 100 / refsize, 3)

        line = [start, end, (end - start + 1), left_ext, right_ext]
        line_str = '(%s, %s, %s)\t%d\t%d'
        # NOTE(review): with no overlapping interval the overlap_regions column
        # is omitted altogether, so that row has one field fewer than the header.
        query_size = 0
        for i, hit in enumerate(sorted_overlap):
            line_str += '\t%s' if i == 0 else ';%s'
            line.append(hit)
            query_size += hit[1] - hit[0] + 1
        line_str += '\t%s'
        line.append(query_size)

        # The fraction of the overlapping query intervals covered by the reference
        if query_size > 0:
            overlap_percentage_pred = round(
                overlap_bases * 100 / query_size, 3)
        else:
            overlap_percentage_pred = 0

        # Gaps between consecutive overlapping query intervals
        gaps = []
        if len(sorted_overlap) > 1:
            previous_end = sorted_overlap[0][1]
            for hit in sorted_overlap[1:]:
                # 2-1 = 1, but they are adjacent
                gaps.append(hit[0] - previous_end - 1)
                previous_end = hit[1]

        if use_genes:
            # 8 fields
            line_str += '\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'
            line.extend([
                overlap_percentage, overlap_percentage_pred, num_refgenes,
                num_predictedgenes, num_overlapgenes, overlap_gene_percentage,
                overlap_gene_percentage_pred, gaps
            ])
        else:
            line_str += '\t%s\t%s\t%s'
            line.extend([overlap_percentage, overlap_percentage_pred, gaps])
        print(line_str % tuple(line))

    # The number of all the predicted intervals overlapping with the reference
    num_overlap = len(set(overlap_intervals))
    print('\nThe number of predicted intervals: %s' % len(query_intervals))
    print('The number of reference intervals: %s' % len(ref_intervals))
    print('The number of predicted reference intervals (TPs, at least certain fraction (0.4 by default) of the reference interval is predicted): %s' % tp_interval)
    print('The number of unpredicted reference intervals (FNs): %s' % (
        len(ref_intervals) - tp_interval))
    print('The number of reference intervals not overlapping with predictions: %s' % num_nooverlap)
    print('The number of predicted intervals overlapping with the reference: %s' % num_overlap)
    # The query intervals not overlapping with the reference
    unique_intervals = set(query_intervals) - set(overlap_intervals)
    print('The number of predicted intervals not overlapping with the reference (FPs): %s' % len(
        unique_intervals))

    ########################### PR in #intervals ###########################
    # This is in terms of #intervals. Not meaningful. For reference.
    # recall = TP/(TP+FN), precision = TP/(TP+FP)
    tp = tp_interval
    # Predicted intervals not overlapping with the reference
    fp = len(unique_intervals)
    # For convenience, FN is the number of unpredicted reference intervals
    fn = len(ref_intervals) - tp
    real = len(ref_intervals)
    predicted = len(set(query_intervals))
    # tp + fn is used instead of real, following the original note that recall
    # could otherwise exceed 1 when many predictions overlap the same GI.
    recall = ratio(tp, tp + fn)
    precision = ratio(tp, tp + fp)
    if recall != 0 and precision != 0:
        fmeasure = 2 * recall * precision / (recall + precision)
    else:
        fmeasure = 0
    print('Interval Recall: %.3f\tPrecision: %.3f\tF-measure: %.3f\tPredicted intervals: %s\tTotal intervals: %s\t' % (
        recall, precision, fmeasure, predicted, real))

    ######################### PR in #overlap bases #########################
    tp = overlap_totalSize
    real = get_interval_length(ref_intervals)
    # Two alternative ways to get the number of bases in reference intervals
    assert real == ref_totalSize
    # Merge before counting as query_intervals may be overlapping
    predicted = getOverlapIntervalSize(query_intervals)
    recall = ratio(tp, real)
    precision = ratio(tp, predicted)
    if recall != 0 and precision != 0:
        fmeasure = 2 * recall * precision / (recall + precision)
    else:
        fmeasure = 0

    # Offset error is appended to the last summary line
    (avg_left, avg_right, avg_offset) = getAvgBoundaryError(extentions)
    print('The number of reference bases: %d' % real)

    ######################### PR in #overlap genes #########################
    if use_genes:
        tp = len(overlap_total_genes)
        g_real = len(ref_total_genes)
        query_total_genes = set()
        for query_interval in query_intervals:
            query_total_genes.update(
                getGenesInInterval(query_interval, genelist,
                                   options.cutoff_gene))
        g_predicted = len(query_total_genes)
        g_recall = ratio(tp, g_real)
        g_precision = ratio(tp, g_predicted)
        if g_recall != 0 and g_precision != 0:
            g_fmeasure = 2 * g_recall * g_precision / (g_recall + g_precision)
        else:
            g_fmeasure = 0
        diff_fmeasure = g_fmeasure - fmeasure
        print('The number of reference genes: %d' % g_real)

        # Other measures
        fn = g_real - tp
        fp = g_predicted - tp
        neg = len(genelist) - g_real
        tn = neg - fp
        tnr = ratio(tn, tn + fp)
        oacc = ratio(tp + tn, tp + tn + fn + fp)
        acc = (g_recall + tnr) / 2
        mcc = ratio(tp * tn - fp * fn,
                    math.sqrt((tp + fp) * (tp + fn) * (tn + fp) * (tn + fn)))

        format_str = 'Bases Recall: %.3f\tPrecision: %.3f\tF-measure: %.3f\tF-measure Difference: %.3f\tLeft offset: %d\tRight: %d\tPredicted bases: %d\tOverlap bases: %d\tAverage interval size: %d\tOverlap genes: %d\tPredicted genes: %d\tPredicted intervals: %d\tGene Recall: %.3f\tPrecision: %.3f\tF-measure: %.3f\tAvg offset: %d\tTNR: %.3f\tOACC: %.3f\tACC: %.3f\tMCC: %.3f'
        print(format_str % (
            recall, precision, fmeasure, diff_fmeasure, avg_left, avg_right,
            predicted, overlap_totalSize, avg_query_len,
            len(overlap_total_genes), g_predicted, len(query_intervals),
            g_recall, g_precision, g_fmeasure, avg_offset, tnr, oacc, acc,
            mcc))
    else:
        format_str = 'Bases Recall: %.3f\tPrecision: %.3f\tF-measure: %.3f\tLeft offset: %d\tRight: %d\tPredicted bases: %d\tOverlap bases: %d\tPredicted intervals: %d\tAverage interval size: %d\tAvg offset: %d'
        print(format_str % (recall, precision, fmeasure, avg_left, avg_right,
                            predicted, overlap_totalSize,
                            len(query_intervals), avg_query_len, avg_offset))

    # Output the predictions matching / not matching the reference
    if options.overlap:
        overlap_intervals = sorted(set(overlap_intervals), key=by_position)
        writeListOfTupleToFile(options.overlap, overlap_intervals)
    if options.output:
        unique_intervals = sorted(unique_intervals, key=by_position)
        writeListOfTupleToFile(options.output, unique_intervals)

    return (recall, precision, fmeasure, options.output)
def get_overlap_statistics(ref_intervals, query_intervals, genelist, options):
    '''
    For each reference interval, check the query intervals that overlap with
    it, contig by contig, and print base-level evaluation statistics.

    ref_intervals   -- dict: contig id -> list of (start, end, size)
    query_intervals -- dict: contig id -> list of predicted intervals
    genelist        -- not used: gene-level columns are not computed here
    options         -- object providing cutoff_base, the fraction of a
                       reference interval that must be covered for it to
                       count as predicted

    A per-reference table and summary statistics are printed to stdout;
    nothing is returned.
    '''

    def ratio(numerator, denominator):
        # An undefined ratio (nothing predicted, no shared contig, ...) is
        # reported as 0, like the F-measure guard below.
        if denominator == 0:
            return 0
        return numerator / denominator

    overlap_intervals = {}
    unique_intervals = {}
    extentions_dict = {}
    # The number of reference intervals with no overlapping query intervals
    num_nooverlap = 0
    ref_totalSize = 0
    overlap_totalSize = 0
    tp_interval = 0

    for contig_id, r_intervals in ref_intervals.items():
        if contig_id not in query_intervals:
            # No prediction on this contig. Its reference bases still belong
            # to the total (checked by the assertion below) and none of its
            # intervals overlaps a prediction.
            for start, end, size in r_intervals:
                ref_totalSize += int(end) - int(start) + 1
                num_nooverlap += 1
            continue

        tree = get_window_tree(query_intervals[contig_id])
        overlap_intervals[contig_id] = []
        extentions = []
        for start, end, size in r_intervals:
            # Find all query intervals overlapping with the reference
            overlap = find(start, end, tree)
            if len(overlap) == 0:
                num_nooverlap += 1
            overlap_intervals[contig_id].extend(overlap)

            ########################## boundary offset ##########################
            sorted_overlap = sorted(overlap,
                                    key=lambda x: (int(x[0]), int(x[1])))
            left_ext = 0
            right_ext = 0
            if len(sorted_overlap) > 0:
                left_ext = sorted_overlap[0][0] - start
                right_ext = end - sorted_overlap[-1][1]
                extentions.append((abs(left_ext), abs(right_ext)))

            ############################ base metrics ###########################
            overlap_bases = 0
            # Intersections of the reference with each overlapping query interval
            overlap_interval_list = []
            for interval in overlap:
                overlap_interval = get_overlap_interval(interval, (start, end))
                if overlap_interval is not None:
                    overlap_interval_list.append(overlap_interval)
            if len(overlap_interval_list) > 0:
                # Suppose intervals in overlap_interval_list do not overlap
                overlap_bases = get_interval_length(overlap_interval_list)
            refsize = int(end) - int(start) + 1
            ref_totalSize += refsize

            # Only count the overlap if the reference interval counts as found
            if overlap_bases > options.cutoff_base * refsize:
                overlap_totalSize += overlap_bases
                tp_interval += 1
            else:
                # Not enough overlap: drop this reference's occurrences again
                for interval in overlap:
                    overlap_intervals[contig_id].remove(interval)

            # The fraction of the reference interval covered by the query intervals
            overlap_percentage = round(overlap_bases * 100 / refsize, 3)

            line = [contig_id, start, end, (end - start + 1), left_ext,
                    right_ext]
            line_str = '%d\t(%s, %s, %s)\t%d\t%d'
            for hit in sorted_overlap:
                line_str += '\t%s'
                line.append(hit)

            # Gaps between consecutive overlapping query intervals
            gaps = []
            if len(sorted_overlap) > 1:
                previous_end = sorted_overlap[0][1]
                for hit in sorted_overlap[1:]:
                    gaps.append(hit[0] - previous_end - 1)
                    previous_end = hit[1]

            # Gene-level columns are not produced: this function never computes
            # the gene counts, and the branch that tried to print them raised
            # NameError.
            line_str += '\t%s\t%s'
            line.extend([overlap_percentage, gaps])
            print(line_str % tuple(line))

        unique_intervals[contig_id] = list(
            set(query_intervals[contig_id]) - set(overlap_intervals[contig_id]))
        extentions_dict[contig_id] = extentions

    # For all the contigs (all the interval containers are dictionaries)
    num_overlap = sum(len(v) for v in overlap_intervals.values())
    num_ref = sum(len(v) for v in ref_intervals.values())
    num_pred = sum(len(v) for v in query_intervals.values())
    # Predictions on contigs without any reference interval are all unmatched
    for contig_id in query_intervals:
        if contig_id not in unique_intervals:
            unique_intervals[contig_id] = query_intervals[contig_id]
    num_uniq = sum(len(v) for v in unique_intervals.values())

    print('\nThe number of predicted intervals: %s' % num_pred)
    print('The number of reference intervals: %s' % num_ref)
    print('The number of predicted reference intervals (TPs): %s' % tp_interval)
    print('The number of unpredicted reference intervals (FNs): %s' % (
        num_ref - tp_interval))
    print('The number of reference intervals not overlapping with predictions: %s' % num_nooverlap)
    # Some intervals may overlap with different reference intervals, so this
    # number may be overestimated
    print('The number of predicted intervals overlapping with the reference: %s' % num_overlap)
    print('The number of predicted intervals not overlapping with the reference (FPs): %s' % num_uniq)

    ######################### PR in #overlap bases #########################
    tp = overlap_totalSize
    real = 0
    for contig_id in ref_intervals:
        real += get_interval_length(ref_intervals[contig_id])
    # Two alternative ways to get the number of bases in reference intervals
    assert real == ref_totalSize
    # Merge before counting as query_intervals may be overlapping
    predicted = 0
    for contig_id in query_intervals:
        predicted += getOverlapIntervalSize(query_intervals[contig_id])
    recall = ratio(tp, real)
    precision = ratio(tp, predicted)
    if recall != 0 and precision != 0:
        fmeasure = 2 * recall * precision / (recall + precision)
    else:
        fmeasure = 0

    # Boundary errors are averaged per contig, then over the contigs
    lavg = 0
    ravg = 0
    oavg = 0
    count_ext = 0
    for extentions in extentions_dict.values():
        (avg_left, avg_right, avg_offset) = getAvgBoundaryError(extentions)
        lavg += avg_left
        ravg += avg_right
        oavg += avg_offset
        count_ext += 1
    avg_offset_all = ratio(oavg, count_ext)
    avg_left_all = ratio(lavg, count_ext)
    avg_right_all = ratio(ravg, count_ext)
    avg_size = ratio(predicted, num_pred)

    print('The number of reference bases: %d' % real)
    format_str = 'Bases Recall: %.3f\tPrecision: %.3f\tF-measure: %.3f\tLeft offset: %d\tRight: %d\tPredicted bases: %d\tOverlap bases: %d\tPredicted intervals: %d\tAverage interval size: %d\tAvg offset: %d'
    print(format_str % (recall, precision, fmeasure, avg_left_all,
                        avg_right_all, predicted, overlap_totalSize, num_pred,
                        avg_size, avg_offset_all))