コード例 #1
0
def parse_repseek_output(infile, intervals):
    """Collect the distant repeats whose ends lie close to an interval's ends.

    Every line of ``infile`` is split on tabs into twelve columns: repeat
    type, position of each copy, length of each copy, spacer, seed,
    identity, score, meanR, modeR and fraction.  A repeat is considered only
    when its spacer is within [4500, 600000]; it is then reported for every
    interval returned by ``find`` whose start is less than 3000 away from the
    first copy and whose end is less than 3000 away from the end of the
    second copy.

    Returns a dict mapping each such interval (the tuple returned by
    ``find``) to a list of ``(type, pos1, end, len1, len2)`` tuples, where
    ``end`` is ``pos2 + len2``.
    """
    window_tree = get_window_tree(intervals)
    repeats_by_interval = {}

    with open(infile, 'r') as handle:
        for raw in handle:
            cols = raw.strip().split('\t')
            repeat_type = cols[0]
            first_pos, second_pos = int(cols[1]), int(cols[2])
            first_len, second_len = int(cols[3]), int(cols[4])
            spacer = int(cols[5])
            # The remaining columns are not used below; they are still
            # converted so that a malformed line fails at the same place.
            seed = cols[6]
            identity, score, mean_r, mode_r, fraction = [
                float(cols[idx]) for idx in range(7, 12)
            ]

            # Only distant repeats are of interest.
            if spacer < 4500 or spacer > 600000:
                continue

            repeat_end = second_pos + second_len
            record = (repeat_type, first_pos, repeat_end, first_len,
                      second_len)
            for hit in find(first_pos, repeat_end, window_tree):
                hit_start, hit_end, hit_score = hit
                # Both repeat copies must sit near the interval boundaries.
                if (abs(hit_start - first_pos) < 3000
                        and abs(hit_end - repeat_end) < 3000):
                    repeats_by_interval.setdefault(hit, []).append(record)
    return repeats_by_interval
コード例 #2
0
def parse_trnas(infile, intervals, offset):
    """Group tRNA coordinates by the intervals they fall in or near.

    ``infile`` is read line by line; the 3rd and 4th tab-separated columns
    are taken as the two coordinates of a tRNA (in either order).  The tRNA
    range is widened by ``offset`` on both sides before the interval tree
    built from ``intervals`` is queried.

    Returns a dict mapping each interval returned by ``find`` to the list of
    ``(pos1, pos2)`` pairs (as they appear in the file) that matched it.
    """
    window_tree = get_window_tree(intervals)
    trnas_by_interval = {}

    with open(infile, 'r') as handle:
        for raw in handle:
            cols = raw.strip().split('\t')
            pos1 = int(cols[2])
            pos2 = int(cols[3])
            # Normalise the orientation for the query only; the stored pair
            # keeps the order found in the file.
            lower, upper = min(pos1, pos2), max(pos1, pos2)
            # Widen the search region so that a nearby tRNA still counts.
            for hit in find(lower - offset, upper + offset, window_tree):
                hit_start, hit_end, hit_score = hit
                trnas_by_interval.setdefault(hit, []).append((pos1, pos2))
    return trnas_by_interval
コード例 #3
0
def parse_trnas_contigs(infile, id_mapping, intervals, offset):
    """Group tRNA coordinates by nearby intervals on a multi-contig genome.

    Args:
        infile: path of a tab-separated file; column 1 holds the contig
            name, columns 3 and 4 the two tRNA coordinates (either order).
        id_mapping: dict translating a contig name into the contig id used
            as key of ``intervals``.
        intervals: dict mapping a contig id to the intervals of that contig.
        offset: number of positions by which the tRNA range is widened on
            both sides before querying, so a nearby tRNA still counts.

    Returns:
        A dict whose keys are ``('<id>_<start>', '<id>_<end>')`` string
        pairs identifying an interval and whose values are the lists of all
        ``(pos1, pos2)`` pairs (in file order) found in or near it.

    Raises:
        KeyError: if a contig name is missing from ``id_mapping`` or its id
            is missing from ``intervals``.
    """
    trna_dict = {}
    # One interval tree per contig, built on first use instead of once per
    # input line.
    trees = {}

    with open(infile, 'r') as fin:
        for line in fin:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline).
                continue
            fields = line.strip().split('\t')
            name = fields[0].strip()
            contig_id = id_mapping[name]
            pos1 = int(fields[2])
            pos2 = int(fields[3])
            tstart, tend = min(pos1, pos2), max(pos1, pos2)

            if contig_id not in trees:
                trees[contig_id] = get_window_tree(intervals[contig_id])

            # Extend search region in case a tRNA is nearby
            overlap = find(tstart - offset, tend + offset, trees[contig_id])
            for intv in overlap:
                istart, iend = intv[0], intv[1]
                key = (str(contig_id) + '_' + str(istart),
                       str(contig_id) + '_' + str(iend))
                # Accumulate under the composite key.  The previous code
                # tested the raw interval tuple for membership, which never
                # matched and therefore overwrote earlier tRNAs.
                trna_dict.setdefault(key, []).append((pos1, pos2))

    return trna_dict
コード例 #4
0
def extend_boundary_contig(intervals, genes_dict):
    """Snap interval boundaries to gene boundaries, contig by contig.

    ``intervals`` holds ``(p1, p2)`` label pairs of the form
    ``'<contig id>_<coordinate>'``; ``genes_dict`` maps an integer contig id
    to the genes of that contig.  For every interval the overlapping genes
    are looked up.  If the first (last) overlapping gene sticks out of the
    interval and the overhang is smaller than half of that gene, the
    interval start (end) is moved to the gene boundary.

    Returns a dict mapping a contig id to the list of ``(start, end)``
    integer pairs of that contig, in input order.
    """
    # One lookup tree per contig.
    trees_by_contig = {}
    for cid, genes in genes_dict.items():
        trees_by_contig[cid] = get_window_tree(genes)

    # Keyed by contig so that overlapping regions can be merged later on.
    extended = {}
    for left_label, right_label in intervals:
        # The position of the separator is taken from the first label and
        # reused for the second one.
        sep = left_label.index('_')
        start = int(left_label[sep + 1:])
        end = int(right_label[sep + 1:])
        contig_id = int(left_label[0:sep])
        hits = find(start, end, trees_by_contig[contig_id])

        if len(hits) > 0:
            print('intervals with overlapping:')
            print(left_label)
            print(right_label)
            print(hits)
            ordered = sorted(hits, key=lambda g: (int(g[0]), int(g[1])))
            first_gene = ordered[0]
            last_gene = ordered[-1]
            first_size = first_gene[1] - first_gene[0] + 1
            left_overhang = start - first_gene[0] + 1
            last_size = last_gene[1] - last_gene[0] + 1
            right_overhang = last_gene[1] - end + 1

            # Extend only when less than half of the gene lies outside.
            new_start = start
            if first_gene[0] < start and left_overhang < first_size / 2:
                new_start = first_gene[0]
            new_end = end
            if last_gene[1] > end and right_overhang < last_size / 2:
                new_end = last_gene[1]
            coord = (new_start, new_end)
        else:
            coord = (start, end)

        extended.setdefault(contig_id, []).append(coord)
    return extended
コード例 #5
0
def getCommonIntervals(interval1, interval2, fraction):
    '''
    Intersect two lists of intervals.

    Every (start, end) pair of interval1 is queried against a tree built
    from interval2.  For each hit the shared region is recorded; a pair is
    additionally reported as "matched" when the shared region is larger than
    fraction times the size of the interval2 hit.

    Returns an empty list when either input is empty.  Otherwise returns
    three sets: the shared regions as (start, end, size), the matched
    (start, end) pairs of interval1, and the matched (start, end) pairs of
    interval2.
    '''
    if not len(interval1) or not len(interval2):
        return []

    query_tree = get_window_tree(interval2)

    shared_regions = set()
    matched_first = set()
    matched_second = set()

    for start, end in interval1:
        for hit in find(start, end, query_tree):
            hit_start, hit_end = hit[0], hit[1]
            clip_start = max(hit_start, start)
            clip_end = min(hit_end, end)
            clip_size = clip_end - clip_start + 1
            shared_regions.add((clip_start, clip_end, clip_size))

            hit_size = hit_end - hit_start + 1
            if clip_size > fraction * hit_size:
                # Sizes are left out so the sets can be intersected directly.
                matched_first.add((start, end))
                matched_second.add((hit_start, hit_end))

    return shared_regions, matched_first, matched_second
コード例 #6
0
def extend_boundary(intervals, genes):
    """Snap interval boundaries to the boundaries of overlapping genes.

    For every ``(start, end)`` pair in ``intervals`` the overlapping genes
    are looked up in a tree built from ``genes``.  If the first (last)
    overlapping gene sticks out of the interval and the overhang is smaller
    than half of that gene, the interval start (end) is moved to the gene
    boundary.  Intervals without overlapping genes are kept unchanged.

    Returns the list of resulting ``(start, end)`` pairs in input order.
    """
    gene_tree = get_window_tree(genes)
    extended = []
    for start, end in intervals:
        hits = find(start, end, gene_tree)
        if len(hits) == 0:
            extended.append((start, end))
            continue

        ordered = sorted(hits, key=lambda g: (int(g[0]), int(g[1])))
        first_gene = ordered[0]
        last_gene = ordered[-1]
        first_size = first_gene[1] - first_gene[0] + 1
        left_overhang = start - first_gene[0] + 1
        last_size = last_gene[1] - last_gene[0] + 1
        right_overhang = last_gene[1] - end + 1

        # Extend only when less than half of the gene lies outside.
        new_start = start
        if first_gene[0] < start and left_overhang < first_size / 2:
            new_start = first_gene[0]
        new_end = end
        if last_gene[1] > end and right_overhang < last_size / 2:
            new_end = last_gene[1]
        extended.append((new_start, new_end))
    return extended
コード例 #7
0
def getMetric_base(ref_intervals, query_intervals, genelist, options):
    '''
    Evaluate predicted (query) intervals against reference intervals and print a report.

    For evaluation of the predictions of intervals based on nucleotides or genes.
    A given percentage is used to determine whether a reference interval (genomic island or gene) is detected.
    If only a base of an interval is detected, it is meaningless to classify this interval as predicted.

    Parameters:
        ref_intervals: sequence of (start, end) reference intervals.
        query_intervals: sequence of predicted intervals; a lookup tree is built from it
            and its length is used as a divisor.
        genelist: gene records handed to getGenesInInterval; only consulted when
            options.pttfile or options.gene_list is set.
        options: object whose attributes output, overlap, pttfile, gene_list,
            cutoff_gene and cutoff_base are read here.

    Side effects:
        Prints one tab-separated line per reference interval followed by summary lines.
        When options.output is set, boundary offsets are written to options.output + '_offset'
        and the query intervals not counted as overlapping are written to options.output.
        When options.overlap is set, the overlapping query intervals are written to it.

    Returns:
        (recall, precision, fmeasure, options.output), where the first three values are
        the base-level metrics.

    NOTE(review): the print statements are Python 2 syntax. Under Python 2 the `/`
    operator truncates on integer operands unless the file enables true division,
    which would affect every ratio computed below -- confirm at the top of the file.
    '''
    print '(start, end, size)\tleft_offset\tright_offset\toverlap_regions\tpredicted_size\toverlap_percentage\toverlap_percentage_pred\tnum_reference_genes\tnum_predicted_genes\tnum_overlap_genes\toverlap_gene_percentage\toverlap_gene_percentage_pred\tgaps'
    tree = get_window_tree(query_intervals)
    # NOTE(review): an empty query_intervals makes the divisor zero here.
    avg_query_len = get_interval_length(query_intervals) / len(query_intervals)
    # The list of query intervals overlapping with the reference
    # Use list for convenience in inserting, convert to set later in case some intervals overlapping with multiple reference intervals
    overlap_intervals = []
    # The number of reference intervals with no overlapping query intervals
    num_nooverlap = 0

    # For evaluation based on bases
    # The total size of all the reference intervals
    ref_totalSize = 0
    overlap_totalSize = 0
    # To record the boundary offset for each reference interval
    extentions = []
    # The empty string serves as the "no offset file" sentinel checked further down
    foffset = ''
    if options.output:
        offsetfile = options.output + '_offset'
        foffset = open(offsetfile, 'w')

    # For evaluation based on genes
    if options.pttfile or options.gene_list:
        overlap_total_genes = set()
        ref_total_genes = set()
    # Number of reference intervals whose covered bases exceed the cutoff
    tp_interval = 0

    for start, end in ref_intervals:
        # Find all query intervals overlapping with the reference (even 1 bp overlap is counted here)
        overlap = find(start, end, tree)
        if len(
                overlap
        ) == 0:  # No query intervals overlapping with the reference interval
            num_nooverlap += 1
        # Update overlap_intervals for all reference intervals
        overlap_intervals.extend(overlap)

        ################################## boundary offset ######################################################
        # Format string and values of the offset-file line are built in parallel
        offset_line_str = ''
        offset_line = []
        '''
        The predicted interval may be much larger than the reference interval.
        Compute the extended region for FPs to check boundary accuracy.
        There maybe several predicted intervals, only the boundary intervals need to be checked
        '''
        sorted_overlap = sorted(overlap, key=lambda x: (int(x[0]), int(x[1])))
        left_ext = 0
        right_ext = 0
        if (len(sorted_overlap) > 0):
            # negative number represents predicted interval crossing the reference region
            left_ext = sorted_overlap[0][0] - start
            right_ext = end - sorted_overlap[-1][1]
            # If we only count boundary error when the overlap is large enough, comment out this statement
            extentions.append((abs(left_ext), abs(right_ext)))
            offset_line_str += '%s\t%s\t%s\t%s'
            offset_line.extend([start, end, left_ext, right_ext])

        ##################################### gene metrics ####################################################
        if options.pttfile or options.gene_list:
            ref_genes = getGenesInInterval((start, end), genelist,
                                           options.cutoff_gene)
            num_refgenes = len(ref_genes)
            ref_total_genes.update(ref_genes)
            # There may be overlap if some genes are counted in both intervals
            overlap_genes = set()
            # The number of genes in the predicted intervals, to get an idea of over/under estimation
            predicted_genes = set()

            for interval in overlap:
                genes = getGenesInInterval(interval, genelist,
                                           options.cutoff_gene)
                # Output genes across the boundary of all predicted intervals
                if len(genes) > 0:
                    sorted_genes = sorted(genes,
                                          key=lambda x: (int(x[0]), int(x[1])))
                    offset_line.extend([sorted_genes[0], sorted_genes[-1]])
                    offset_line_str += '\t%s\t%s'
                overlap_gene = set(ref_genes).intersection(genes)
                # For this ref interval
                predicted_genes.update(genes)
                overlap_genes.update(overlap_gene)
                # For all the ref intervals
                overlap_total_genes.update(overlap_genes)

            num_overlapgenes = len(overlap_genes)
            # num_predictedgenes > num_overlapgenes, since genes in the predicted interval may not overlap with reference intervals
            num_predictedgenes = len(predicted_genes)

            if num_refgenes > 0:
                overlap_gene_percentage = round(
                    num_overlapgenes * 100 / num_refgenes, 3)
            else:
                overlap_gene_percentage = 0

            if num_predictedgenes > 0:
                overlap_gene_percentage_pred = round(
                    num_overlapgenes * 100 / num_predictedgenes, 3)
            else:
                overlap_gene_percentage_pred = 0

        ##################################### base metrics ####################################################
        # Check the coverage of the overlap
        overlap_bases = 0
        # Record the intervals of reference and query to get the union of overlapping bases
        overlap_interval_list = []
        for interval in overlap:
            # overlap_interval is a tuple
            overlap_interval = get_overlap_interval(interval, (start, end))
            if overlap_interval is not None:
                overlap_interval_list.append(overlap_interval)

        # One line per reference interval; it is only a newline when nothing overlapped
        if foffset != '':
            offset_line_str += '\n'
            foffset.write(offset_line_str % tuple(offset_line))

        if len(overlap_interval_list) > 0:
            overlap_bases = getOverlapIntervalSize(overlap_interval_list)

        refsize = int(end) - int(start) + 1
        ref_totalSize += refsize
        # only count overlap if the reference interval are counted as found
        if overlap_bases > options.cutoff_base * refsize:
            overlap_totalSize += overlap_bases
            tp_interval += 1
        # not enough overlap
        else:
            # Undo the extend() above; list.remove drops one occurrence per call
            for interval in overlap:
                overlap_intervals.remove(interval)

        # The fraction of a reference interval covered by all the query interval overlapping with it
        overlap_percentage = round(overlap_bases * 100 / refsize, 3)

        # Values and format string of the printed report line are built in parallel
        line = [start, end, (end - start + 1), left_ext, right_ext]
        overlap_len = len(sorted_overlap)
        line_str = '(%s, %s, %s)\t%d\t%d'

        # Compute the gap between intervals when there are more than two intervals
        suffix_str = ';%s'
        query_size = 0
        # Note: the name `overlap` is rebound as the loop variable from here on
        for i, overlap in enumerate(sorted_overlap):
            if i == 0:
                line_str += '\t%s'
            else:
                line_str += suffix_str
            line.append(overlap)
            query_size += overlap[1] - overlap[0] + 1

        line_str += '\t%s'
        line.append(query_size)
        # The fraction of the overlapping query intervals covered by the reference interval
        if query_size > 0:
            overlap_percentage_pred = round(overlap_bases * 100 / query_size,
                                            3)
        else:
            overlap_percentage_pred = 0

        # Distances between consecutive overlapping query intervals (sorted order)
        gaps = []
        if overlap_len > 1:
            p1 = sorted_overlap[0][1]
            for overlap in sorted_overlap[1:]:
                p2 = overlap[0]
                # 2-1 = 1, but they are adjacent
                gap = p2 - p1 - 1
                gaps.append(gap)
                p1 = overlap[1]
        # Find all genes in a reference interval
        if options.pttfile or options.gene_list:
            # 8 fields
            line_str += '\t%s\t%s\t%s\t%s\t%s\t%s\t%s\t%s'
            line.extend([
                overlap_percentage, overlap_percentage_pred, num_refgenes,
                num_predictedgenes, num_overlapgenes, overlap_gene_percentage,
                overlap_gene_percentage_pred, gaps
            ])
        else:
            line_str += '\t%s\t%s\t%s'
            line.extend([overlap_percentage, overlap_percentage_pred, gaps])
        print line_str % tuple(line)

    # The number of all the predicted intervals overlapping with the reference
    num_overlap = len(set(overlap_intervals))
    print '\nThe number of predicted intervals: %s' % len(query_intervals)
    print 'The number of reference intervals: %s' % len(ref_intervals)
    print 'The number of predicted reference intervals (TPs, at least certain fraction (0.4 by default) of the reference interval is predicted): %s' % tp_interval
    print 'The number of unpredicted reference intervals (FNs): %s' % (
        len(ref_intervals) - tp_interval)
    print 'The number of reference intervals not overlapping with predictions: %s' % num_nooverlap
    # Some intervals may be overlapped with different reference intervals, so this number may be overestimated
    print 'The number of predicted intervals overlapping with the reference: %s' % num_overlap
    # The query intervals not overlapping with the reference
    unique_intervals = set(query_intervals) - set(overlap_intervals)
    print 'The number of predicted intervals not overlapping with the reference (FPs): %s' % len(
        unique_intervals)

    ############################## PR in #intervals ###############################################
    '''
    This is in term of #intervals. Not meaningful. For reference.
    '''
    # recall: TP/(TP+FN), precision= TP/(TP+FP)
    tp = tp_interval
    # The number of predicted intervals not overlapping with the reference, hard to be mapped to reference intervals
    # fp = len(query_intervals) - tp
    fp = len(unique_intervals)
    # By definition, FN should be the number of unpredicted intervals overlapping with the reference interval
    # Here, for convenience, it is the number of unpredicted reference intervals
    fn = len(ref_intervals) - tp
    real = len(ref_intervals)

    predicted = len(set(query_intervals))
    '''
    When using real, recall may be larger than 1, as many predicted intervals may overlap with the same GI.
    So we use tp+fn instead.
    '''
    # NOTE(review): tp + fn equals len(ref_intervals) and tp + fp can be 0; both divisors are unguarded
    recall = tp / (tp + fn)
    precision = tp / (tp + fp)
    if recall != 0 and precision != 0:
        fmeasure = 2 * recall * precision / (recall + precision)
    else:
        fmeasure = 0
    print 'Interval Recall: %.3f\tPrecision: %.3f\tF-measure: %.3f\tPredicted intervals: %s\tTotal intervals: %s\t' % (
        recall, precision, fmeasure, predicted, real)

    ############################## PR in #overlap bases ###############################################
    '''
    This is in term of overlapping bases.
    '''
    # From here on recall/precision/fmeasure hold the base-level values that are returned
    tp = overlap_totalSize
    real = get_interval_length(ref_intervals)
    # Two alternative ways to get the number of bases in reference intervals
    assert real == ref_totalSize
    # Merge before counting as query_intervals may be overlapping
    predicted = getOverlapIntervalSize(query_intervals)

    recall = tp / real
    precision = tp / predicted
    if recall != 0 and precision != 0:
        fmeasure = 2 * recall * precision / (recall + precision)
    else:
        fmeasure = 0
    # Append offset error at the end of last line
    (avg_left, avg_right, avg_offset) = getAvgBoundaryError(extentions)

    print 'The number of reference bases: %d' % real

    ############################## PR in #overlap genes ###############################################
    '''
    This is in term of overlapping genes.
    '''
    if options.pttfile or options.gene_list:
        tp = len(set(overlap_total_genes))
        g_real = len(set(ref_total_genes))
        query_total_genes = set()
        for query_interval in query_intervals:
            query_genes = getGenesInInterval(query_interval, genelist,
                                             options.cutoff_gene)
            query_total_genes.update(query_genes)
        g_predicted = len(query_total_genes)

        # NOTE(review): g_real and g_predicted can be 0; the divisions are unguarded
        g_recall = tp / g_real
        g_precision = tp / g_predicted
        if g_recall != 0 and g_precision != 0:
            g_fmeasure = 2 * g_recall * g_precision / (g_recall + g_precision)
        else:
            g_fmeasure = 0
        diff_fmeasure = g_fmeasure - fmeasure
        print 'The number of reference genes: %d' % g_real

        # Other measures
        # Confusion-matrix counts at gene level; negatives are the genes of genelist outside the reference
        fn = g_real - tp
        fp = g_predicted - tp
        neg = len(genelist) - g_real
        tn = neg - fp
        tnr = (tn) / (tn + fp)
        oacc = (tp + tn) / (tp + tn + fn + fp)
        acc = (g_recall + tnr) / 2
        # Requires the math module to be imported at file level
        mcc = (tp * tn - fp * fn) / math.sqrt(
            (tp + fp) * (tp + fn) * (tn + fp) * (tn + fn))

    if options.pttfile or options.gene_list:
        format_str = 'Bases Recall: %.3f\tPrecision: %.3f\tF-measure: %.3f\tF-measure Difference: %.3f\tLeft offset: %d\tRight: %d\tPredicted bases: %d\tOverlap bases: %d\tAverage interval size: %d\tOverlap genes: %d\tPredicted genes: %d\tPredicted intervals: %d\tGene Recall: %.3f\tPrecision: %.3f\tF-measure: %.3f\tAvg offset: %d\tTNR: %.3f\tOACC: %.3f\tACC: %.3f\tMCC: %.3f'
        print format_str % (
            recall, precision, fmeasure, diff_fmeasure, avg_left, avg_right,
            predicted, overlap_totalSize, avg_query_len,
            len(overlap_total_genes), g_predicted, len(query_intervals),
            g_recall, g_precision, g_fmeasure, avg_offset, tnr, oacc, acc, mcc)
    else:
        format_str = 'Bases Recall: %.3f\tPrecision: %.3f\tF-measure: %.3f\tLeft offset: %d\tRight: %d\tPredicted bases: %d\tOverlap bases: %d\tPredicted intervals: %d\tAverage interval size: %d\tAvg offset: %d'
        print format_str % (recall, precision, fmeasure, avg_left,
                            avg_right, predicted, overlap_totalSize,
                            len(query_intervals), avg_query_len, avg_offset)

    # Output FP predictions
    if options.overlap:
        # Sort intervals
        overlap_intervals = sorted(set(overlap_intervals),
                                   key=lambda x: (int(x[0]), int(x[1])))
        writeListOfTupleToFile(options.overlap, overlap_intervals)
    if options.output:
        unique_intervals = sorted(unique_intervals,
                                  key=lambda x: (int(x[0]), int(x[1])))
        writeListOfTupleToFile(options.output, unique_intervals)

    # NOTE(review): the offset file is not closed if an exception is raised earlier
    if foffset != '':
        foffset.close()

    return (recall, precision, fmeasure, options.output)
コード例 #8
0
ファイル: eval4contigs.py プロジェクト: icelu/GI_Cluster
def get_overlap_statistics(ref_intervals, query_intervals, genelist, options):
    '''
    For each reference interval, check the query intervals that overlap with it

    Parameters:
        ref_intervals: dict mapping a contig id to a list of (start, end, size) reference intervals.
        query_intervals: dict mapping a contig id to the predicted intervals of that contig.
        genelist: not referenced in the part of the function shown here.
        options: object whose attributes cutoff_base and genefile are read here.

    Prints one tab-separated line per reference interval of every contig present in both
    dicts, followed by interval counts and the base-level recall, precision and F-measure.

    NOTE(review): the print statements and dict.itervalues() are Python 2 only. Under
    Python 2 the `/` operator truncates on integer operands unless the file enables true
    division -- confirm at the top of the file.
    '''
    overlap_intervals = {}
    unique_intervals = {}
    extentions_dict = {}
    # The number of reference intervals with no overlapping query intervals
    num_nooverlap = 0

    ref_totalSize = 0
    overlap_totalSize = 0
    # Number of reference intervals whose covered bases exceed the cutoff
    tp_interval = 0

    for id, r_intervals in ref_intervals.items():
        # Find query intervals with the same ID
        # Contigs without predictions are skipped entirely (not counted in ref_totalSize)
        if id not in query_intervals.keys():
            continue
        q_intervals = query_intervals[id]
        tree = get_window_tree(q_intervals)
        overlap_intervals[id] = []
        unique_intervals[id] = []
        extentions = []
        for start, end, size in r_intervals:
            # Find all query intervals overlapping with the reference
            overlap = find(start, end, tree)
            if len(
                    overlap
            ) == 0:  # There is no query intervals overlapping with a reference interval
                num_nooverlap += 1
            # Update overlap_intervals for all reference intervals
            overlap_intervals[id].extend(overlap)

            ################################## boundary offset ######################################################
            sorted_overlap = sorted(overlap,
                                    key=lambda x: (int(x[0]), int(x[1])))
            left_ext = 0
            right_ext = 0
            if (len(sorted_overlap) > 0):
                # Signed distances between the outermost query boundaries and the reference boundaries
                left_ext = sorted_overlap[0][0] - start
                right_ext = end - sorted_overlap[-1][1]
                extentions.append((abs(left_ext), abs(right_ext)))

            ##################################### base metrics ####################################################
            # Check the coverage of the overlap
            overlap_bases = 0
            # Record the intervals of reference and query to get the union of overlapping bases
            overlap_interval_list = []
            for interval in overlap:
                overlap_interval = get_overlap_interval(interval, (start, end))
                if overlap_interval is not None:
                    overlap_interval_list.append(overlap_interval)

            if len(overlap_interval_list) > 0:
                # Suppose intervals in overlap_interval_list do not overlap
                overlap_bases = get_interval_length(overlap_interval_list)

            refsize = int(end) - int(start) + 1
            ref_totalSize += refsize
            # only count overlap if the reference interval are counted as found
            if overlap_bases > options.cutoff_base * refsize:
                overlap_totalSize += overlap_bases
                tp_interval += 1
            # not enough overlap
            else:
                # Undo the extend() above; list.remove drops one occurrence per call
                for interval in overlap:
                    overlap_intervals[id].remove(interval)

            # The fraction of a reference interval covered by all the query interval overlapping with it
            overlap_percentage = round(overlap_bases * 100 / refsize, 3)

            # Values and format string of the printed report line are built in parallel
            suffix_str = '\t%s'
            line = [id, start, end, (end - start + 1), left_ext, right_ext]
            overlap_len = len(sorted_overlap)
            # '%d' is applied to id, so contig ids must be numeric
            line_str = '%d\t(%s, %s, %s)\t%d\t%d'

            # Append every overlapping query interval as its own column
            # Note: the name `overlap` is rebound as the loop variable from here on
            for overlap in sorted_overlap:
                line_str += suffix_str
                line.append(overlap)

            # Compute the gap between intervals when there are more than two intervals
            gaps = []
            if overlap_len > 1:
                p1 = sorted_overlap[0][1]
                for overlap in sorted_overlap[1:]:
                    p2 = overlap[0]
                    gap = p2 - p1 - 1
                    gaps.append(gap)
                    p1 = overlap[1]
            # Find all genes in a reference interval
            # NOTE(review): num_refgenes, num_predictedgenes, num_overlapgenes and
            # overlap_gene_percentage are not assigned anywhere in the visible part of this
            # function; unless they exist at module level this branch raises NameError.
            if options.genefile:
                line_str += '\t%s\t%s\t%s\t%s\t%s\t%s'
                line.extend([
                    overlap_percentage, num_refgenes, num_predictedgenes,
                    num_overlapgenes, overlap_gene_percentage, gaps
                ])
            else:
                line_str += '\t%s\t%s'
                line.extend([overlap_percentage, gaps])
            print line_str % tuple(line)

        # Query intervals of this contig that were not kept as overlapping
        unique_intervals[id] = list(
            set(query_intervals[id]) - set(overlap_intervals[id]))
        extentions_dict[id] = extentions

    # For all the contigs
    # The number of all the predicted intervals overlapping with the reference
    # Note: all the intervals are dictionaries
    num_overlap = sum(len(v) for v in overlap_intervals.itervalues())
    num_ref = sum(len(v) for v in ref_intervals.itervalues())
    num_pred = sum(len(v) for v in query_intervals.itervalues())
    # Contigs that only occur in the predictions contribute all their intervals as unique
    for id in query_intervals.keys():
        if id not in unique_intervals.keys():
            unique_intervals[id] = query_intervals[id]
    num_uniq = sum(len(v) for v in unique_intervals.itervalues())
    print '\nThe number of predicted intervals: %s' % num_pred
    print 'The number of reference intervals: %s' % num_ref
    print 'The number of predicted reference intervals (TPs): %s' % tp_interval
    print 'The number of unpredicted reference intervals (FNs): %s' % (
        num_ref - tp_interval)
    print 'The number of reference intervals not overlapping with predictions: %s' % num_nooverlap
    # Some intervals may be overlapped with different reference intervals, so this number may be overestimated
    print 'The number of predicted intervals overlapping with the reference: %s' % num_overlap
    print 'The number of predicted intervals not overlapping with the reference (FPs): %s' % num_uniq
    #
    # ############################## PR in #overlap bases ###############################################

    tp = overlap_totalSize
    real = 0
    for id in ref_intervals.keys():
        real += get_interval_length(ref_intervals[id])

    # Two alternative ways to get the number of bases in reference intervals
    # NOTE(review): `real` covers every reference contig while ref_totalSize skips contigs
    # missing from query_intervals, so this assertion looks like it fails in that case -- confirm.
    assert real == ref_totalSize
    # Merge before counting as query_intervals may be overlapping
    predicted = 0
    for id in query_intervals.keys():
        predicted += getOverlapIntervalSize(query_intervals[id])

    # NOTE(review): `real` and `predicted` can be 0; the divisions are unguarded
    recall = tp / real
    precision = tp / predicted
    if recall != 0 and precision != 0:
        fmeasure = 2 * recall * precision / (recall + precision)
    else:
        fmeasure = 0
    # Append offset error at the end of last line
    # The per-contig averages are averaged again with equal weight per contig
    lavg = 0
    ravg = 0
    oavg = 0
    count_ext = 0
    for id, extentions in extentions_dict.items():
        (avg_left, avg_right, avg_offset) = getAvgBoundaryError(extentions)
        lavg += avg_left
        ravg += avg_right
        oavg += avg_offset
        count_ext += 1
    # NOTE(review): count_ext is 0 when no contig is shared by both dicts, and num_pred
    # is 0 when there are no predictions; both divisors are unguarded
    avg_offset_all = oavg / count_ext
    avg_left_all = lavg / count_ext
    avg_right_all = ravg / count_ext
    avg_size = predicted / num_pred
    print 'The number of reference bases: %d' % real
    format_str = 'Bases Recall: %.3f\tPrecision: %.3f\tF-measure: %.3f\tLeft offset: %d\tRight: %d\tPredicted bases: %d\tOverlap bases: %d\tPredicted intervals: %d\tAverage interval size: %d\tAvg offset: %d'
    print format_str % (recall, precision, fmeasure, avg_left_all,
                        avg_right_all, predicted, overlap_totalSize, num_pred,
                        avg_size, avg_offset_all)