def consensus(lst):
    if len(lst) == 0:
        return None
    # first check for read-through transcripts involving multiple
    # reference genes
    same_strand_hits = collections.defaultdict(lambda: [])
    for m in lst:
        category_int = Category.to_int(m.category)
        if category_int == Category.SAME_STRAND:
            same_strand_hits[m.ref_gene_id].append(m)
    # no same strand matches, so no need to worry about
    # read-throughs or multiple gene types
    if len(same_strand_hits) == 0:
        return MatchStats.choose_best(lst)
    # get consensus match from same strand overlapping genes
    # (running totals are accumulated below but not reported)
    total_introns = lst[0].num_introns
    total_length = lst[0].length
    shared_introns = 0
    shared_same_strand_bp = 0
    hits = []
    for genelst in same_strand_hits.itervalues():
        m = MatchStats.choose_best(genelst).copy()
        # use a distinct loop variable so 'm' is not shadowed
        m.ref_gene_type = ','.join(
            sorted(set(x.ref_gene_type for x in genelst)))
        total_introns += m.ref_num_introns
        total_length += m.ref_length
        shared_introns += m.shared_introns
        shared_same_strand_bp += m.shared_same_strand_bp
        hits.append(m)
    # sort reference genes by position
    hits = MatchStats.sort_genome(hits)
    # make a new MatchStats object
    hit = hits[0].copy()
    hit.ref_transcript_id = ','.join(x.ref_transcript_id for x in hits)
    hit.ref_gene_id = ','.join(x.ref_gene_id for x in hits)
    hit.ref_orig_gene_id = ','.join(x.ref_orig_gene_id for x in hits)
    hit.ref_gene_name = ','.join(x.ref_gene_name for x in hits)
    hit.ref_source = ','.join(x.ref_source for x in hits)
    hit.ref_gene_type = ','.join(x.ref_gene_type for x in hits)
    hit.ref_locus = ','.join(x.ref_locus for x in hits)
    hit.ref_length = ','.join(str(x.ref_length) for x in hits)
    hit.ref_num_introns = ','.join(str(x.ref_num_introns) for x in hits)
    hit.shared_same_strand_bp = shared_same_strand_bp
    hit.shared_opp_strand_bp = 0
    hit.shared_introns = shared_introns
    hit.shared_splicing = any(m.shared_splicing for m in hits)
    hit.distance = 0
    if len(same_strand_hits) > 1:
        hit.category = Category.to_str(Category.READ_THROUGH)
    return hit
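

# A minimal sketch (not part of the pipeline) of the read-through logic in
# consensus() above: same-strand hits are grouped by reference gene id, and
# a transcript that matches more than one distinct gene is flagged as a
# read-through. Plain tuples stand in for MatchStats objects here.
def _read_through_demo():
    hits = [('GENE1', 'same_strand'), ('GENE2', 'same_strand'),
            ('GENE3', 'opp_strand')]
    same_strand_hits = collections.defaultdict(lambda: [])
    for gene_id, category in hits:
        if category == 'same_strand':
            same_strand_hits[gene_id].append(gene_id)
    # two distinct same-strand genes -> read-through
    return len(same_strand_hits) > 1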
def choose_best(lst):
    hits = []
    for m in lst:
        total_introns = m.num_introns + m.ref_num_introns
        if total_introns == 0:
            intron_frac = 0.0
        else:
            intron_frac = float(
                m.shared_introns) / (total_introns - m.shared_introns)
        same_strand_frac = float(m.shared_same_strand_bp) / (
            m.length + m.ref_length - m.shared_same_strand_bp)
        opp_strand_frac = float(m.shared_opp_strand_bp) / (
            m.length + m.ref_length - m.shared_opp_strand_bp)
        category_int = Category.to_int(m.category)
        hits.append(
            (int(m.shared_splicing), intron_frac, same_strand_frac,
             opp_strand_frac,
             int(category_int == Category.INTRONIC_SAME_STRAND),
             int(category_int == Category.INTRONIC_OPP_STRAND),
             int(category_int == Category.INTERLEAVING_SAME_STRAND),
             int(category_int == Category.INTERLEAVING_OPP_STRAND),
             int(category_int == Category.ENCOMPASSING_SAME_STRAND),
             int(category_int == Category.ENCOMPASSING_OPP_STRAND),
             int(category_int == Category.INTERGENIC), -abs(m.distance),
             m))
    # sort matches
    hits.sort(reverse=True)
    hit = hits[0][-1]
    return hit
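

# A minimal sketch (not part of the pipeline) of the ranking trick used by
# choose_best() above: Python compares tuples lexicographically, so sorting
# (key1, key2, ..., payload) tuples in reverse order picks the candidate
# with the best key1 and breaks ties with key2, key3, and so on. The keys
# below are hypothetical stand-ins for the MatchStats-derived fields.
def _ranking_demo():
    candidates = [
        # (shared_splicing, intron_frac, -abs(distance), label)
        (0, 0.50, 0, 'unspliced overlap'),
        (1, 0.25, 0, 'splicing match'),
        (0, 0.50, -100, 'distant overlap'),
    ]
    candidates.sort(reverse=True)
    # 'splicing match' wins despite its lower intron fraction because
    # shared_splicing is the highest-priority key
    return candidates[0][-1]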
def impute_transcript(t, gene_map, transcript_map):
    catstr = t.attrs['category']
    catint = Category.to_int(catstr)
    length = t.length
    gene_type = t.attrs.get('gene_type', None)
    ref_gene_type = t.attrs['ref_gene_type']
    # ref_gene_type can be multiple gene types separated by commas.
    # convert into a set of unique gene types
    ref_gene_types = set(ref_gene_type.split(','))
    transcript_types = set(
        impute_transcript_type(catint, length, gene_type, x)
        for x in ref_gene_types)
    transcript_categories = set(GENCODE_CATEGORY_MAP[x]
                                for x in transcript_types)
    # sort and join unique types/categories to make conglomerated category assignments
    transcript_type = ','.join(sorted(transcript_types))
    transcript_category = ','.join(sorted(transcript_categories))
    # alternative: use only the first gene in a read-through for the name
    #ref_gene_name = t.attrs['ref_gene_name'].split(',')[0]
    # hyphenate read-through genes into a long name
    ref_gene_name = '-'.join(t.attrs['ref_gene_name'].split(','))
    # resolve upper/lower case issue with gene names from
    # different databases
    transcript_name = ref_gene_name.upper()
    # build transcript name
    if transcript_name == 'NONE':
        transcript_name = str(t.chrom)
    # append category
    if catint != Category.SAME_STRAND:
        transcript_name = '%s.%s' % (transcript_name, catstr)
    # transcript name string is key to a dictionary that
    # associates each gene id with an integer number
    gene_id = t.attrs['gene_id']
    gene_dict = gene_map[transcript_name]
    if gene_id not in gene_dict:
        gene_num = len(gene_dict) + 1
        gene_dict[gene_id] = gene_num
    else:
        gene_num = gene_dict[gene_id]
    # append gene integers to name
    transcript_name = '%s.%d' % (transcript_name, gene_num)
    # gene id is also key to dict that associates each isoform
    # of gene with integer number
    t_id = t.attrs['transcript_id']
    t_dict = transcript_map[transcript_name]
    if t_id not in t_dict:
        t_num = len(t_dict) + 1
        t_dict[t_id] = t_num
    else:
        t_num = t_dict[t_id]
    # append gene/transcript integers to gene name
    transcript_name = '%s.%d' % (transcript_name, t_num)
    return transcript_type, transcript_category, transcript_name
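

# A minimal sketch (not part of the pipeline) of the numbering scheme that
# impute_transcript() uses to disambiguate names: each name maps to a dict
# that hands out stable 1-based integers per id, so repeated lookups of the
# same id always return the same suffix.
def _numbering_demo():
    gene_map = collections.defaultdict(lambda: {})
    def get_num(name, gene_id):
        gene_dict = gene_map[name]
        if gene_id not in gene_dict:
            gene_dict[gene_id] = len(gene_dict) + 1
        return gene_dict[gene_id]
    assert get_num('TP53', 'G1') == 1
    assert get_num('TP53', 'G2') == 2
    assert get_num('TP53', 'G1') == 1  # stable on repeated lookups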
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--rename', dest='rename', action='store_true')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    rename = args.rename
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    # parse transcripts
    num_transcripts = 0
    # keep track of redundant gene/transcript counts
    gene_map = collections.defaultdict(lambda: {})
    transcript_map = collections.defaultdict(lambda: {})
    for transcripts in parse_gtf(open(gtf_file)):
        for t in transcripts:
            catstr = t.attrs['category']
            catint = Category.to_int(catstr)
            gene_type = t.attrs.get('gene_type', None)
            ref_gene_type = t.attrs['ref_gene_type']
            if catint == Category.SAME_STRAND:
                # impute gene type
                new_gene_type = ref_gene_type
            else:
                if gene_type == 'protein_coding':
                    # don't change protein coding genes
                    new_gene_type = gene_type
                elif t.length < 250:
                    # categorize small RNA separately
                    new_gene_type = 'misc_RNA'
                else:
                    if ref_gene_type == 'protein_coding':
                        # categorize based on overlap with reference
                        new_gene_type = PROTEIN_CATEGORY_MAP[catint]
                    else:
                        # reference is also non-coding
                        new_gene_type = 'lincRNA'
            # get gene category
            gene_category = GENCODE_CATEGORY_MAP[new_gene_type]
            new_gene_name = None
            if rename:
                # resolve upper/lower case issue with gene names from 
                # different databases
                ref_gene_name = t.attrs['ref_gene_name'].upper()
                # build new gene name                
                if ref_gene_name == 'NONE':
                    new_gene_name = str(t.attrs['source'])
                elif catint == Category.SAME_STRAND:
                    new_gene_name = str(ref_gene_name)
                else:
                    new_gene_name = '%s.%s' % (ref_gene_name, catstr)
                # gene name string is key to a dictionary that
                # associates each gene id with an integer number
                gene_id = t.attrs['gene_id']
                gene_dict = gene_map[new_gene_name]
                if gene_id not in gene_dict:
                    gene_num = len(gene_dict) + 1
                    gene_dict[gene_id] = gene_num
                else:
                    gene_num = gene_dict[gene_id]
                # gene id is also key to dict that associates each isoform
                # of gene with integer number
                t_id = t.attrs['transcript_id']
                t_dict = transcript_map[gene_id]
                if t_id not in t_dict:
                    t_num = len(t_dict) + 1
                    t_dict[t_id] = t_num
                else:
                    t_num = t_dict[t_id]
                # append gene/transcript integers to gene name
                new_gene_name = '%s.%d.%d' % (new_gene_name, gene_num, t_num)
            # write new attributes
            for f in t.to_gtf_features(source='assemblyline', score=1000):
                f.attrs['gene_type'] = new_gene_type
                f.attrs['gene_category'] = gene_category 
                if rename:
                    if 'gene_name' in f.attrs:
                        f.attrs['orig_gene_name'] = f.attrs['gene_name']
                    f.attrs['gene_name'] = new_gene_name        
                print str(f)
            num_transcripts += 1
    return 0
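

# Example invocation (the script file name here is hypothetical; main()
# reads a category-annotated GTF named on the command line and writes the
# renamed/re-typed GTF features to stdout):
#
#   python annotate_assembly.py --rename assembly.cmp.gtf > assembly.annotated.gtf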
def compare_assemblies(ref_gtf_file, test_gtf_file, output_dir):
    # output files
    if not os.path.exists(output_dir):
        logging.info('Creating output dir: %s' % (output_dir))
        os.makedirs(output_dir)
    # merge step
    merged_gtf_file = os.path.join(output_dir, "merged.gtf")
    merged_sorted_gtf_file = os.path.splitext(merged_gtf_file)[0] + ".srt.gtf"
    merge_done_file = os.path.join(output_dir, 'merged.done')
    sort_done_file = os.path.join(output_dir, 'sort.done')
    if not os.path.exists(merge_done_file):
        # merge and sort ref/test gtf files
        logging.info("Merging reference and test GTF files")
        # make temporary file to store merged ref/test gtf files
        with open(merged_gtf_file, "w") as fileh:
            logging.info("Adding reference GTF file")
            add_gtf_file(ref_gtf_file, fileh, is_ref=True)
            logging.info("Adding test GTF file")
            add_gtf_file(test_gtf_file, fileh, is_ref=False)
        open(merge_done_file, 'w').close()
    if not os.path.exists(sort_done_file):
        logging.info("Sorting merged GTF file")
        # create temp directory
        tmp_dir = os.path.join(output_dir, 'tmp')
        if not os.path.exists(tmp_dir):
            logging.debug("Creating tmp directory '%s'" % (tmp_dir))
            os.makedirs(tmp_dir)
        sort_gtf(merged_gtf_file, merged_sorted_gtf_file, tmp_dir=tmp_dir)
        # cleanup
        shutil.rmtree(tmp_dir)
        open(sort_done_file, 'w').close()
    # compare assemblies
    overlapping_gtf_file = os.path.join(output_dir, 'overlapping.gtf')
    intergenic_tmp_gtf_file = os.path.join(output_dir, 'intergenic.tmp.gtf')
    overlapping_file = os.path.join(output_dir, 'overlapping.tsv')
    overlapping_consensus_file = os.path.join(output_dir,
                                              'overlapping.consensus.tsv')
    overlapping_done_file = os.path.join(output_dir, 'overlapping.done')
    stats_file = os.path.join(output_dir, 'stats.txt')
    stats_obj = GlobalStats()
    num_intergenic = 0
    if not os.path.exists(overlapping_done_file):
        logging.info("Comparing assemblies")
        gtf_fileh = open(overlapping_gtf_file, 'w')
        tmp_gtf_fileh = open(intergenic_tmp_gtf_file, 'w')
        overlapping_fileh = open(overlapping_file, 'w')
        overlapping_consensus_fileh = open(overlapping_consensus_file, 'w')
        for locus_transcripts in parse_gtf(open(merged_sorted_gtf_file)):
            locus_chrom = locus_transcripts[0].chrom
            locus_start = locus_transcripts[0].start
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug(
                "[LOCUS] %s:%d-%d %d transcripts" %
                (locus_chrom, locus_start, locus_end, len(locus_transcripts)))
            for t, match_stats in compare_locus(locus_transcripts):
                if len(match_stats) == 0:
                    # write intergenic transcripts to analyze separately
                    t.attrs['category'] = Category.to_str(Category.INTERGENIC)
                    for f in t.to_gtf_features(source='assembly'):
                        print >> tmp_gtf_fileh, str(f)
                    num_intergenic += 1
                else:
                    # get consensus match information
                    consensus_match = MatchStats.consensus(match_stats)
                    assert consensus_match is not None
                    t.attrs['category'] = consensus_match.category
                    # add gtf attributes and write
                    for f in t.to_gtf_features(source='assembly'):
                        consensus_match.add_gtf_attributes(f)
                        print >> gtf_fileh, str(f)
                    # tab-delimited text output
                    print >> overlapping_consensus_fileh, str(consensus_match)
                    for ms in match_stats:
                        print >> overlapping_fileh, str(ms)
            # compute global statistics
            stats_obj.compute(locus_transcripts)
        logging.info("Reporting global statistics")
        with open(stats_file, 'w') as f:
            print >> f, stats_obj.report()
        gtf_fileh.close()
        tmp_gtf_fileh.close()
        overlapping_fileh.close()
        overlapping_consensus_fileh.close()
        open(overlapping_done_file, 'w').close()
    # resolve intergenic transcripts
    intergenic_gtf_file = os.path.join(output_dir, 'intergenic.gtf')
    intergenic_file = os.path.join(output_dir, 'intergenic.tsv')
    intergenic_best_file = os.path.join(output_dir, 'intergenic.best.tsv')
    intergenic_done_file = os.path.join(output_dir, 'intergenic.done')
    if not os.path.exists(intergenic_done_file):
        logging.info("Building interval index")
        locus_trees = build_locus_trees(merged_sorted_gtf_file)
        logging.info('Finding nearest matches to intergenic transcripts')
        gtf_fileh = open(intergenic_gtf_file, 'w')
        intergenic_fileh = open(intergenic_file, 'w')
        intergenic_best_fileh = open(intergenic_best_file, 'w')
        for locus_transcripts in parse_gtf(open(intergenic_tmp_gtf_file)):
            for t in locus_transcripts:
                # find nearest transcripts
                nearest_transcripts = find_nearest_transcripts(
                    t.chrom, t.start, t.end, t.strand, locus_trees)
                match_stats = []
                best_match = None
                if len(nearest_transcripts) == 0:
                    best_match = MatchStats.from_transcript(t)
                    best_match.category = Category.to_str(Category.INTERGENIC)
                    match_stats.append(best_match)
                else:
                    for ref, category, dist in nearest_transcripts:
                        # create a match object
                        ms = MatchStats.from_transcript(t, ref)
                        ms.shared_same_strand_bp = 0
                        ms.shared_opp_strand_bp = 0
                        ms.shared_introns = 0
                        ms.shared_splicing = False
                        ms.category = Category.to_str(category)
                        ms.distance = dist
                        match_stats.append(ms)
                    # choose the consensus match
                    best_match = MatchStats.choose_best(match_stats)
                # add gtf attributes and write
                for f in t.to_gtf_features(source='assembly'):
                    best_match.add_gtf_attributes(f)
                    print >> gtf_fileh, str(f)
                # write tab-delimited data
                print >> intergenic_best_fileh, str(best_match)
                for ms in match_stats:
                    print >> intergenic_fileh, str(ms)
        gtf_fileh.close()
        intergenic_fileh.close()
        intergenic_best_fileh.close()
        open(intergenic_done_file, 'w').close()
    # merge overlapping and intergenic results
    logging.info('Merging results')
    metadata_file = os.path.join(output_dir, 'metadata.txt')
    metadata_consensus_file = os.path.join(output_dir,
                                           'metadata.consensus.txt')
    assembly_gtf_file = os.path.join(output_dir, 'assembly.cmp.gtf')
    combine_done_file = os.path.join(output_dir, 'done')
    if not os.path.exists(combine_done_file):
        filenames = [overlapping_file, intergenic_file]
        with open(metadata_file, 'w') as outfile:
            print >> outfile, '\t'.join(MatchStats.header_fields())
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        filenames = [overlapping_consensus_file, intergenic_best_file]
        with open(metadata_consensus_file, 'w') as outfile:
            print >> outfile, '\t'.join(MatchStats.header_fields())
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        filenames = [intergenic_gtf_file, overlapping_gtf_file]
        with open(assembly_gtf_file, 'w') as outfile:
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        open(combine_done_file, 'w').close()
    logging.info("Done")
def compare_locus(transcripts):
    # store reference introns
    # (strand, start, end) -> list of reference transcripts
    ref_intron_dict = collections.defaultdict(lambda: [])
    ref_node_dict = collections.defaultdict(lambda: [])
    ref_splicing_patterns = collections.defaultdict(lambda: [])
    ref_dict = {}
    # find the intron domains of the transcripts
    boundaries = find_exon_boundaries(transcripts)
    test_transcripts = []
    for t in transcripts:
        # separate ref and nonref transcripts
        is_ref = bool(int(t.attrs[GTFAttr.REF]))
        if is_ref:
            # add to dict
            ref_id = t.attrs[GTFAttr.TRANSCRIPT_ID]
            ref_dict[ref_id] = t
            # split exons that cross boundaries and get the
            # nodes in the transcript path
            for n in split_exons(t, boundaries):
                ref_node_dict[n].append(t)
            # add to introns
            splicing_pattern = []
            for start, end in t.iterintrons():
                intron = (t.strand, start, end)
                ref_intron_dict[intron].append(t)
                splicing_pattern.append(intron)
            # add to splicing patterns
            if len(splicing_pattern) > 0:
                ref_splicing_patterns[tuple(splicing_pattern)].append(t)
        else:
            test_transcripts.append(t)
    # index introns for fast intersection
    intron_tree = IntervalTree()
    for intron, refs in ref_intron_dict.iteritems():
        strand, start, end = intron
        intron_tree.insert_interval(
            Interval(start, end, strand=strand, value=refs))
    # categorize transcripts
    for t in test_transcripts:
        # get transcript nodes and introns
        nodes = list(split_exons(t, boundaries))
        introns = []
        for start, end in t.iterintrons():
            introns.append((t.strand, start, end))
        splicing_pattern = tuple(introns)
        # keep list of all matching ref transcripts
        matches = collections.defaultdict(lambda: Match())
        # dict of reference transcripts -> category -> list of nodes
        for n in nodes:
            if n in ref_node_dict:
                # look for reference transcripts that share this node
                for ref in ref_node_dict[n]:
                    if cmp_strand(t.strand, ref.strand):
                        c = Category.SAME_STRAND
                    else:
                        c = Category.OPP_STRAND
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
            # look for reference introns that overlap this node
            for hit in intron_tree.find(*n):
                if cmp_strand(t.strand, hit.strand):
                    c = Category.INTRONIC_SAME_STRAND
                else:
                    c = Category.INTRONIC_OPP_STRAND
                for ref in hit.value:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.nodes[c].append(n)
        # dict of introns -> list of reference transcripts
        for intron in introns:
            if intron in ref_intron_dict:
                for ref in ref_intron_dict[intron]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.introns.append(intron)
        # check splicing pattern matches
        if len(splicing_pattern) > 0:
            if splicing_pattern in ref_splicing_patterns:
                for ref in ref_splicing_patterns[splicing_pattern]:
                    ref_id = ref.attrs[GTFAttr.TRANSCRIPT_ID]
                    m = matches[ref_id]
                    m.splicing = True
        # go through the matches for this transcript and determine
        # the transcript category
        match_stats = []
        for ref_id, m in matches.iteritems():
            ref = ref_dict[ref_id]
            # calculate coverage
            same_strand_bp = sum(
                (n[1] - n[0]) for n in m.nodes[Category.SAME_STRAND])
            opp_strand_bp = sum(
                (n[1] - n[0]) for n in m.nodes[Category.OPP_STRAND])
            # count shared introns
            num_shared_introns = len(m.introns)
            # decide category for this test/ref transcript pair
            if m.splicing or (num_shared_introns > 0) or (same_strand_bp > 0):
                c = Category.SAME_STRAND
            elif (opp_strand_bp > 0):
                c = Category.OPP_STRAND
            else:
                # count nodes of different types
                num_same_strand = len(m.nodes[Category.SAME_STRAND])
                num_opp_strand = len(m.nodes[Category.OPP_STRAND])
                num_intronic_same_strand = len(
                    m.nodes[Category.INTRONIC_SAME_STRAND])
                num_intronic_opp_strand = len(
                    m.nodes[Category.INTRONIC_OPP_STRAND])
                assert num_same_strand == 0
                assert num_opp_strand == 0
                num_intronic = (num_intronic_same_strand +
                                num_intronic_opp_strand)
                assert num_intronic > 0
                if (num_intronic == len(nodes)):
                    # completely intronic
                    if num_intronic_same_strand > 0:
                        c = Category.INTRONIC_SAME_STRAND
                    else:
                        c = Category.INTRONIC_OPP_STRAND
                else:
                    # interleaving means some nodes are intronic and others intergenic
                    if num_intronic_same_strand > 0:
                        c = Category.INTERLEAVING_SAME_STRAND
                    else:
                        c = Category.INTERLEAVING_OPP_STRAND
            # create a match object
            ms = MatchStats.from_transcript(t, ref)
            ms.shared_same_strand_bp = same_strand_bp
            ms.shared_opp_strand_bp = opp_strand_bp
            ms.shared_introns = num_shared_introns
            ms.shared_splicing = m.splicing
            ms.category = Category.to_str(c)
            ms.distance = 0
            match_stats.append(ms)
        yield (t, match_stats)
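

# A minimal sketch (not part of the pipeline) of the exact-intron matching
# in compare_locus() above: introns are keyed as (strand, start, end)
# tuples, so a shared intron is a plain dict lookup, and a full splicing
# pattern match compares the tuple of all introns at once.
def _intron_match_demo():
    ref_intron_dict = {('+', 100, 200): ['ref1'], ('+', 300, 400): ['ref1']}
    ref_splicing_patterns = {(('+', 100, 200), ('+', 300, 400)): ['ref1']}
    test_introns = [('+', 100, 200), ('+', 300, 400)]
    shared = [i for i in test_introns if i in ref_intron_dict]
    full_match = tuple(test_introns) in ref_splicing_patterns
    return len(shared), full_match  # -> (2, True)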