def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--library-type", dest="library_type",
                        default=LibraryTypes.FR_UNSTRANDED)
    parser.add_argument("--input-sam", dest="input_sam", action="store_true", 
                        default=False)
    parser.add_argument("--output-sam", dest="output_sam", action="store_true", 
                        default=False)
    parser.add_argument("genome_index")
    parser.add_argument("transcript_feature_file")
    parser.add_argument("input_sam_file")
    parser.add_argument("output_sam_file") 
    args = parser.parse_args()
    # read transcript features
    logging.debug("Reading transcript features")
    transcripts = list(TranscriptFeature.parse(open(args.transcript_feature_file)))
    return transcriptome_to_genome(args.genome_index,
                                   transcripts,
                                   args.input_sam_file, 
                                   args.output_sam_file,
                                   args.library_type,
                                   args.input_sam,
                                   args.output_sam)
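# A hypothetical invocation of the CLI above (script and file names are
# placeholders, not from this excerpt; the positional arguments are the
# genome index, the transcript feature file, and the input/output
# alignment files):
#
#   python transcriptome_to_genome.py --output-sam \
#       /path/to/index transcripts.txt hits.transcriptome.bam hits.genome.sam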
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--max-fragment-length', dest="max_fragment_length", 
                        type=int, default=config.DEFAULT_MAX_FRAG_LENGTH)
    parser.add_argument('--library', dest="library_type", 
                        default=LibraryTypes.FR_UNSTRANDED)
    parser.add_argument('--max-multihits', dest="max_multihits",
                        type=int, default=config.DEFAULT_MAX_MULTIHITS)
    parser.add_argument("transcript_file")
    parser.add_argument("input_bam_file")
    parser.add_argument("paired_bam_file")
    parser.add_argument("discordant_bam_file")
    parser.add_argument("unpaired_bam_file")
    parser.add_argument("unmapped_bam_file")
    parser.add_argument("multimap_bam_file")
    parser.add_argument("unresolved_bam_file")
    args = parser.parse_args()    
    # read transcript features
    logging.debug("Reading transcript features")
    transcripts = list(TranscriptFeature.parse(open(args.transcript_file)))
    return find_discordant_fragments(transcripts,
                                     args.input_bam_file, 
                                     args.paired_bam_file,
                                     args.discordant_bam_file,
                                     args.unpaired_bam_file,
                                     args.unmapped_bam_file, 
                                     args.multimap_bam_file,
                                     args.unresolved_bam_file,
                                     max_isize=args.max_fragment_length,
                                     max_multihits=args.max_multihits,
                                     library_type=args.library_type)
def discordant_reads_to_bedpe(index_dir, input_bam_file, output_file):
    # open BAM alignment file
    bamfh = pysam.Samfile(input_bam_file, "rb")
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading transcript features")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
    outfh = open(output_file, "w")    
    logging.debug("Converting BAM to BEDPE format")
    for r5p,r3p in parse_gene_discordant_reads(bamfh):
        # store pertinent read information in a lightweight DiscordantRead
        # object; this departs from the SAM format into a custom read format
        dr5p = DiscordantRead.from_read(r5p)
        dr3p = DiscordantRead.from_read(r3p)
        # get gene information
        tx5p = tid_tx_map[r5p.rname]
        tx3p = tid_tx_map[r3p.rname]
        # write bedpe format
        fields = [tx5p.tx_id, r5p.pos, r5p.aend,
                  tx3p.tx_id, r3p.pos, r3p.aend,
                  r5p.qname,  # read name
                  0, # score
                  tx5p.strand, tx3p.strand, # strand 1, strand 2
                  ]
        fields.append('|'.join(map(str, dr5p.to_list())))
        fields.append('|'.join(map(str, dr3p.to_list())))  
        print >>outfh, '\t'.join(map(str, fields)) 
    outfh.close()
    bamfh.close()
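# A minimal sketch of reading the BEDPE lines emitted above back into a
# tuple. The column layout mirrors the 'fields' list built in
# discordant_reads_to_bedpe; the two trailing DiscordantRead columns are kept
# as raw '|'-delimited field lists because the inverse constructor is not
# shown in this excerpt.
def parse_discordant_bedpe_line(line):
    fields = line.rstrip('\n').split('\t')
    tx_id_5p, start_5p, end_5p = fields[0], int(fields[1]), int(fields[2])
    tx_id_3p, start_3p, end_3p = fields[3], int(fields[4]), int(fields[5])
    qname = fields[6]
    score = int(fields[7])
    strand_5p, strand_3p = fields[8], fields[9]
    dr5p_fields = fields[10].split('|')
    dr3p_fields = fields[11].split('|')
    return (tx_id_5p, start_5p, end_5p, tx_id_3p, start_3p, end_3p,
            qname, score, strand_5p, strand_3p, dr5p_fields, dr3p_fields)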
def filter_multihits(transcript_file,
                     input_bam_file,
                     output_bam_file,
                     max_multihits=1):
    logging.debug("Reading transcript features")
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    # parse and convert sam -> bam
    inbamfh = pysam.Samfile(input_bam_file, "rb")
    outbamfh = pysam.Samfile(output_bam_file, "wb", template=inbamfh)
    # build a transcript to genome coordinate map
    tid_tx_genome_map = build_tid_transcript_genome_map(outbamfh, transcripts)
    num_frags = 0
    logging.debug("Annotating and filtering multihits")
    for pe_reads in parse_pe_reads(inbamfh):
        mate_num_hits = []
        for reads in pe_reads:
            num_hits = annotate_multihits(reads, tid_tx_genome_map)
            mate_num_hits.append(num_hits)
        new_pe_reads = [[], []]
        if mate_num_hits[0] > max_multihits:
            r = copy_read(pe_reads[0][0])
            r.is_unmapped = True
            r.is_proper_pair = False
            r.is_secondary = False
            r.rname = -1
            r.pos = 0
            if mate_num_hits[1] > max_multihits:
                r.mate_is_unmapped = True
                r.mrnm = -1
                r.mpos = 0
            new_pe_reads[0] = [r]
        else:
            new_pe_reads[0] = pe_reads[0]
        if mate_num_hits[1] > max_multihits:
            r = copy_read(pe_reads[1][0])
            r.is_unmapped = True
            r.is_proper_pair = False
            r.is_secondary = False
            r.rname = -1
            r.pos = 0
            if mate_num_hits[0] > max_multihits:
                r.mate_is_unmapped = True
                r.mrnm = -1
                r.mpos = 0
            new_pe_reads[1] = [r]
        else:
            new_pe_reads[1] = pe_reads[1]
        # write the filtered reads
        for reads in new_pe_reads:
            for r in reads:
                outbamfh.write(r)
        num_frags += 1
    logging.debug("Found %d fragments" % (num_frags))
    inbamfh.close()
    outbamfh.close()
    return config.JOB_SUCCESS
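# A minimal sketch of the copy_read helper used above, written against the
# classic pysam AlignedRead API that this code targets. This is an
# assumption based on how copy_read is called, not the project's own
# implementation.
import pysam

def copy_read_sketch(r):
    a = pysam.AlignedRead()
    # copy the core alignment fields; the caller then overwrites the
    # mapping attributes to mark the read (and possibly its mate) unmapped
    a.qname = r.qname
    a.seq = r.seq
    a.qual = r.qual
    a.flag = r.flag
    a.rname = r.rname
    a.pos = r.pos
    a.mapq = r.mapq
    a.cigar = r.cigar
    a.mrnm = r.mrnm
    a.mpos = r.mpos
    a.isize = r.isize
    a.tags = list(r.tags)
    return a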
def find_discordant_fragments(input_bam_file, paired_bam_file,
                              unmapped_bam_file, index_dir, max_isize,
                              library_type):
    """
    Parses a BAM file and categorizes reads into several groups:
    - concordant
    - discordant within gene (splicing isoforms)
    - discordant between different genes (chimeras)
    """
    logging.info("Finding discordant read pair combinations")
    logging.debug("\tInput file: %s" % (input_bam_file))
    logging.debug("\tMax insert size: '%d'" % (max_isize))
    logging.debug("\tLibrary type: '%s'" % (library_type))
    logging.debug("\tGene paired file: %s" % (paired_bam_file))
    logging.debug("\tUnmapped file: %s" % (unmapped_bam_file))
    # setup input and output files
    bamfh = pysam.Samfile(input_bam_file, "rb")
    genefh = pysam.Samfile(paired_bam_file, "wb", template=bamfh)
    unmappedfh = pysam.Samfile(unmapped_bam_file, "wb", template=bamfh)
    # read transcript features
    logging.debug("Reading transcript features")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    logging.debug("Building transcript lookup tables")
    # build a lookup table from bam tid index to transcript object
    tid_tx_map = build_tid_transcript_map(bamfh, transcripts)
    # build a transcript to genome coordinate map
    tid_tx_genome_map = build_tid_transcript_genome_map(bamfh, transcripts)
    logging.info("Parsing reads")
    for pe_reads in parse_pe_reads(bamfh):
        # add hit index and multimap information to read tags
        # this function also checks for unmapped reads
        any_unmapped = False
        for reads in pe_reads:
            any_unmapped = (any_unmapped or annotate_multihits(
                bamfh, reads, tid_tx_genome_map))
        if any_unmapped:
            # write to the unmapped output file and continue to the
            # next fragment
            write_pe_reads(unmappedfh, pe_reads)
            continue
        # examine all read pairing combinations and rule out invalid pairings
        gene_pairs, unpaired_reads = classify_read_pairs(
            pe_reads, max_isize, library_type, tid_tx_map)
        if len(gene_pairs) > 0:
            write_pairs(genefh, gene_pairs)
        # TODO: do something with unpaired discordant reads?
    genefh.close()
    unmappedfh.close()
    bamfh.close()
    logging.info("Finished pairing reads")
    return config.JOB_SUCCESS
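# Minimal sketches of the two small output helpers used above, inferred from
# their call sites (pe_reads is a pair of read lists as produced by
# parse_pe_reads; gene_pairs is assumed to be an iterable of (read1, read2)
# tuples). Assumptions for illustration, not the project's own code.
def write_pe_reads_sketch(fh, pe_reads):
    for reads in pe_reads:
        for r in reads:
            fh.write(r)

def write_pairs_sketch(fh, read_pairs):
    for r1, r2 in read_pairs:
        fh.write(r1)
        fh.write(r2)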
def filter_highest_coverage_isoforms(index_dir, input_file, output_file):
    # read transcripts
    logging.debug("Reading transcripts")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    # find highest coverage chimeras among isoforms
    kept_chimeras = get_highest_coverage_isoforms(input_file, transcripts)
    num_filtered_chimeras = 0
    f = open(output_file, "w")
    for c in Chimera.parse(open(input_file)):
        if c.name in kept_chimeras:
            num_filtered_chimeras += 1
            print >> f, '\t'.join(map(str, c.to_list()))
    f.close()
    logging.debug("\tAfter choosing best isoform: %d" % num_filtered_chimeras)
    return config.JOB_SUCCESS
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--ann", dest="annotation_source", default="ensembl")
    parser.add_argument("transcript_file")
    parser.add_argument("cluster_shelve_file")
    parser.add_argument("cluster_pair_file")
    parser.add_argument("read_name_file")
    parser.add_argument("output_file")
    args = parser.parse_args()    
    # read transcript features
    logging.debug("Reading transcript features")
    transcripts = list(TranscriptFeature.parse(open(args.transcript_file)))
    # run main function
    retcode = write_output(transcripts, args.cluster_shelve_file, 
                           args.cluster_pair_file, args.read_name_file, 
                           args.output_file, args.annotation_source)
    return retcode
def transcript_features_to_fasta(transcript_feature_file, reference_seq_file):
    ref_fa = pysam.Fastafile(reference_seq_file)
    total = 0
    used = 0
    for g in TranscriptFeature.parse(open(transcript_feature_file)):
        total += 1
        exon_seqs = []
        error_occurred = False
        for start, end in g.exons:
            seq = ref_fa.fetch(g.chrom, start, end)
            if (not seq) or (len(seq) < (end - start)):
                logging.warning(
                    "transcript id %d exon %s:%d-%d not found in reference" %
                    (g.tx_id, g.chrom, start, end))
                error_occurred = True
                break
            exon_seqs.append(seq)
        if error_occurred:
            continue
        used += 1
        # make fasta record
        seq = ''.join(exon_seqs)
        # look for sequences containing only 'N's
        base_counts = collections.Counter(seq)
        valid_bases = sum(base_counts[x]
                          for x in ("A", "T", "G", "C", "a", "t", "g", "c"))
        if valid_bases == 0:
            logging.warning("transcript %d at pos %s:%d-%d lacks valid bases" %
                            (g.tx_id, g.chrom, g.tx_start, g.tx_end))
            continue
        # reverse complement negative stranded sequences
        if g.strand == '-':
            seq = DNA_reverse_complement(seq)
        # break seq onto multiple lines
        seqlines = split_seq(seq, BASES_PER_LINE)
        fa_record = (
            ">%d range=%s:%d-%d strand=%s\n%s" %
            (g.tx_id, g.chrom, g.tx_start, g.tx_end, g.strand, seqlines))
        yield g, fa_record
    logging.info("Used %d/%d gene features" % (used, total))
    ref_fa.close()
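# Minimal sketches of the two sequence helpers used above, inferred from how
# they are called (split_seq wraps a sequence to a fixed number of characters
# per line; DNA_reverse_complement reverse-complements a DNA string). These
# are assumptions, not the project's own implementations.
def split_seq_sketch(seq, chars_per_line):
    # join fixed-width slices with newlines for FASTA output
    return '\n'.join(seq[i:i + chars_per_line]
                     for i in xrange(0, len(seq), chars_per_line))

_COMPLEMENT = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G',
               'a': 't', 't': 'a', 'g': 'c', 'c': 'g'}

def DNA_reverse_complement_sketch(seq):
    # complement each base (unknown bases map to 'N') and reverse
    return ''.join(_COMPLEMENT.get(b, 'N') for b in reversed(seq))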
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--frag-size-mean", dest="frag_size_mean", 
                        type=float, default=DEFAULT_FRAG_SIZE_MEAN)
    parser.add_argument("--frag-size-sd", dest="frag_size_sd", 
                        type=float, default=DEFAULT_FRAG_SIZE_SD)
    parser.add_argument("--rlen", dest="read_length", 
                        type=int, default=DEFAULT_READ_LENGTH)
    parser.add_argument("--stranded", dest="stranded", action="store_true",
                        default=False)
    parser.add_argument("-n", dest="num_frags", type=int, 
                        default=DEFAULT_NUM_FRAGS)
    parser.add_argument("index_dir")
    parser.add_argument("transcript_exprs_file")
    parser.add_argument("chimera_file")
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    # setup parameters
    transcript_file = os.path.join(args.index_dir, config.TRANSCRIPT_FEATURE_FILE)
    genome_fasta_file = os.path.join(args.index_dir, config.GENOME_FASTA_FILE)
    fastafh = pysam.Fastafile(genome_fasta_file)
    logging.info("Reading transcripts")
    transcript_dict = {}
    for t in TranscriptFeature.parse(open(transcript_file)):
        transcript_dict[str(t.tx_id)] = t
    logging.info("Generating simulated reads")
    generate_simulated_reads(fastafh,
                             transcript_dict,
                             args.transcript_exprs_file, 
                             args.chimera_file, 
                             args.output_prefix,
                             frag_size_mean=args.frag_size_mean,
                             frag_size_sd=args.frag_size_sd,
                             num_frags=args.num_frags,
                             read_length=args.read_length,
                             stranded=args.stranded)
    fastafh.close()
def run_chimerascan(runconfig):
    """
    main function for running the chimerascan pipeline
    """
    # print a welcome message
    title_string = "Running chimerascan version %s" % (__version__)
    logging.info(title_string)
    logging.info("-" * len(title_string))
    # validate run configuration
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        return config.JOB_ERROR
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir,
                                      config.RUNCONFIG_XML_FILE)
    logging.info("Writing run configuration to XML file: %s" %
                 (runconfig_xml_file))
    fh = open(runconfig_xml_file, "w")
    print >> fh, xmlstring
    fh.close()
    # mask biotypes and references
    mask_biotypes = set()
    if runconfig.mask_biotypes_file:
        logging.info("Reading biotypes mask file")
        mask_biotypes.update(
            [line.strip() for line in open(runconfig.mask_biotypes_file)])
        logging.info("\tread biotypes: %s" % (','.join(sorted(mask_biotypes))))
    mask_rnames = set()
    if runconfig.mask_rnames_file:
        logging.info("Reading references mask file")
        mask_rnames.update(
            [line.strip() for line in open(runconfig.mask_rnames_file)])
        logging.info("\tread references: %s" % (','.join(sorted(mask_rnames))))
    # read transcripts
    logging.info("Reading transcript features")
    transcript_file = os.path.join(runconfig.index_dir,
                                   config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    logging.info("\tread %d transcripts" % (len(transcripts)))
    # setup alignment indexes
    genome_index = os.path.join(runconfig.index_dir, config.GENOME_INDEX)
    transcriptome_index = os.path.join(runconfig.index_dir,
                                       config.TRANSCRIPTOME_INDEX)
    max_transcriptome_hits_file = os.path.join(runconfig.index_dir,
                                               config.MAX_MULTIMAPPING_FILE)
    max_transcriptome_hits = int(
        open(max_transcriptome_hits_file).next().strip())
    # detect read length
    original_read_length = detect_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 -
                           runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length,
                              trimmed_read_length)
    #
    # Process and inspect the FASTQ files, performing several alterations
    # to the reads:
    #
    # 1) rename them from long string to numbers to save space throughout
    #    the pipeline. also store mapping from read numbers to full names
    #    in a separate file
    # 2) ensure the "/1" and "/2" suffixes exist to denote paired reads
    # 3) convert quality scores to sanger format
    #
    converted_fastq_files = [
        os.path.join(tmp_dir, fq) for fq in config.CONVERTED_FASTQ_FILES
    ]
    read_name_file = os.path.join(tmp_dir, config.READ_NAME_TXT_FILE)
    msg = "Processing FASTQ files"
    skip = all(
        up_to_date(cfq, fq)
        for cfq, fq in zip(converted_fastq_files, runconfig.fastq_files))
    skip = skip and up_to_date(read_name_file, runconfig.fastq_files[0])
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        converted_fastq_prefix = \
            os.path.join(tmp_dir, config.CONVERTED_FASTQ_PREFIX)
        try:
            retcode = process_input_reads(runconfig.fastq_files,
                                          converted_fastq_prefix,
                                          quals=runconfig.quals,
                                          trim5=runconfig.trim5,
                                          trim3=runconfig.trim3)
            if retcode != config.JOB_SUCCESS:
                logging.error("%s step failed" % (msg))
                return config.JOB_ERROR
        except Exception as e:
            logging.info("Cleaning up after error %s" % (str(e)))
            for fq in converted_fastq_files:
                if os.path.isfile(fq):
                    os.remove(fq)
            return config.JOB_ERROR
    #
    # Transcriptome alignment step
    #
    # Align to transcriptome in paired-end mode, trying to resolve as many
    # reads as possible.
    #
    transcriptome_bam_file = os.path.join(tmp_dir,
                                          config.TRANSCRIPTOME_BAM_FILE)
    transcriptome_unaligned_path = os.path.join(
        tmp_dir, config.TRANSCRIPTOME_UNALIGNED_PATH)
    transcriptome_unaligned_fastq_files = tuple(
        os.path.join(tmp_dir, fq)
        for fq in config.TRANSCRIPTOME_UNALIGNED_FASTQ_FILES)
    msg = "Aligning paired-end reads to transcriptome"
    if (all(
            up_to_date(transcriptome_bam_file, fq)
            for fq in converted_fastq_files) and all(
                up_to_date(a, b)
                for a, b in zip(transcriptome_unaligned_fastq_files,
                                converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.TRANSCRIPTOME_LOG_FILE)
        retcode = bowtie2_align_transcriptome_pe(
            transcriptome_index=transcriptome_index,
            genome_index=genome_index,
            transcript_file=transcript_file,
            fastq_files=converted_fastq_files,
            unaligned_path=transcriptome_unaligned_path,
            bam_file=transcriptome_bam_file,
            log_file=log_file,
            library_type=runconfig.library_type,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            max_transcriptome_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(transcriptome_bam_file):
                os.remove(transcriptome_bam_file)
            for f in transcriptome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Sort transcriptome reads by position
    #
    msg = "Sorting transcriptome reads"
    sorted_transcriptome_bam_file = os.path.join(
        runconfig.output_dir, config.SORTED_TRANSCRIPTOME_BAM_FILE)
    if (up_to_date(sorted_transcriptome_bam_file, transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        sorted_aligned_bam_prefix = os.path.splitext(
            sorted_transcriptome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), transcriptome_bam_file,
                   sorted_aligned_bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing BAM file"
    sorted_transcriptome_bam_index_file = sorted_transcriptome_bam_file + ".bai"
    if (up_to_date(sorted_transcriptome_bam_index_file,
                   sorted_transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_transcriptome_bam_file)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir,
                                   config.ISIZE_DIST_FILE)
    msg = "Profiling insert size distribution"
    if up_to_date(isize_dist_file, transcriptome_bam_file):
        logging.info("[SKIPPED] %s" % msg)
        isize_dist = InsertSizeDistribution.from_file(
            open(isize_dist_file, "r"))
    else:
        logging.info(msg)
        bamfh = pysam.Samfile(sorted_transcriptome_bam_file, "rb")
        isize_dist = InsertSizeDistribution.from_genome_bam(
            bamfh,
            transcripts,
            min_isize=min_fragment_length,
            max_isize=runconfig.max_fragment_length,
            max_samples=config.ISIZE_MAX_SAMPLES)
        bamfh.close()
        # if not enough samples, use a normal distribution instead
        # of the empirical distribution
        if isize_dist.n < config.ISIZE_MIN_SAMPLES:
            logging.warning("Not enough fragments to sample insert size "
                            "distribution empirically.  Using mean=%d "
                            "stdev=%f instead" %
                            (runconfig.isize_mean, runconfig.isize_stdev))
            isize_dist = InsertSizeDistribution.from_random(
                runconfig.isize_mean,
                runconfig.isize_stdev,
                min_isize=runconfig.min_fragment_length,
                max_isize=runconfig.max_fragment_length,
                samples=config.ISIZE_MAX_SAMPLES)
        isize_dist.to_file(open(isize_dist_file, "w"))
    #
    # Determine ideal segment length automatically
    #
    # log insert size statistics
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" %
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(),
                  isize_dist.isize_at_percentile(50.0), isize_dist.mode()))
    # choose a segment length to optimize mapping
    optimal_isize = isize_dist.isize_at_percentile(
        DEFAULT_FRAG_SIZE_SENSITIVITY)
    logging.info("Determining soft-clipped segment length")
    logging.debug("\tInsert size at %f percent of distribution is %d" %
                  (DEFAULT_FRAG_SIZE_SENSITIVITY, optimal_isize))
    optimal_segment_length = int(round(optimal_isize / 3.0))
    logging.debug("\tOptimal segment length is %d/3.0 = %d" %
                  (optimal_isize, optimal_segment_length))
    segment_length = min(optimal_segment_length, trimmed_read_length)
    segment_length = max(config.MIN_SEGMENT_LENGTH, segment_length)
    logging.debug(
        "\tAfter adjusting for min %d and read length %d, final segment length is %d"
        % (config.MIN_SEGMENT_LENGTH, trimmed_read_length, segment_length))
    if runconfig.segment_length is not None:
        logging.debug(
            "\tOverriding auto segment length and using segment length of %d" %
            (runconfig.segment_length))
        segment_length = runconfig.segment_length
    #
    # Genome alignment step
    #
    # Align any unaligned transcriptome reads to genome in paired-end mode.
    # Resolve as many reads as possible.
    #
    genome_bam_file = os.path.join(tmp_dir, config.GENOME_BAM_FILE)
    genome_unaligned_path = os.path.join(tmp_dir, config.GENOME_UNALIGNED_PATH)
    genome_unaligned_fastq_files = tuple(
        os.path.join(tmp_dir, fq)
        for fq in config.GENOME_UNALIGNED_FASTQ_FILES)
    msg = "Realigning unaligned paired-end reads to genome"
    if (all(up_to_date(genome_bam_file, fq) for fq in converted_fastq_files)
            and all(
                up_to_date(a, b) for a, b in zip(genome_unaligned_fastq_files,
                                                 converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.GENOME_LOG_FILE)
        retcode = bowtie2_align_pe(
            index=genome_index,
            fastq_files=transcriptome_unaligned_fastq_files,
            unaligned_path=genome_unaligned_path,
            bam_file=genome_bam_file,
            log_file=log_file,
            library_type=runconfig.library_type,
            min_fragment_length=min_fragment_length,
            max_fragment_length=runconfig.max_fragment_length,
            max_hits=max_transcriptome_hits,
            num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(genome_bam_file):
                os.remove(genome_bam_file)
            for f in genome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Realignment step
    #
    # trim and realign all the initially unaligned reads in order to
    # increase sensitivity to detect reads spanning fusion junctions
    #
    realigned_bam_file = os.path.join(tmp_dir, config.REALIGNED_BAM_FILE)
    realigned_log_file = os.path.join(log_dir, config.REALIGNED_LOG_FILE)
    msg = "Trimming and realigning initially unmapped reads"
    if (all(
            up_to_date(realigned_bam_file, fq)
            for fq in genome_unaligned_fastq_files)
            and up_to_date(realigned_bam_file, isize_dist_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = bowtie2_align_pe_sr(index=transcriptome_index,
                                      transcript_file=transcript_file,
                                      fastq_files=genome_unaligned_fastq_files,
                                      bam_file=realigned_bam_file,
                                      log_file=realigned_log_file,
                                      tmp_dir=tmp_dir,
                                      segment_length=segment_length,
                                      max_hits=max_transcriptome_hits,
                                      num_processors=runconfig.num_processors)
        if retcode != config.JOB_SUCCESS:
            if os.path.exists(realigned_bam_file):
                os.remove(realigned_bam_file)
            return config.JOB_ERROR
    #
    # Find discordant reads
    #
    # iterate through realigned reads and divide them into groups of
    # concordant, discordant within a gene (isoforms), discordant
    # between different genes, and discordant in the genome
    #
    paired_bam_file = os.path.join(tmp_dir, config.PAIRED_BAM_FILE)
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    unpaired_bam_file = os.path.join(tmp_dir, config.UNPAIRED_BAM_FILE)
    unmapped_bam_file = os.path.join(tmp_dir, config.UNMAPPED_BAM_FILE)
    multimap_bam_file = os.path.join(tmp_dir, config.MULTIMAP_BAM_FILE)
    unresolved_bam_file = os.path.join(tmp_dir, config.UNRESOLVED_BAM_FILE)
    output_files = (paired_bam_file, discordant_bam_file, unpaired_bam_file,
                    unmapped_bam_file, multimap_bam_file, unresolved_bam_file)
    msg = "Classifying concordant and discordant read pairs"
    if (all(up_to_date(f, realigned_bam_file) for f in output_files)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = find_discordant_fragments(
            transcripts=transcripts,
            input_bam_file=realigned_bam_file,
            paired_bam_file=paired_bam_file,
            discordant_bam_file=discordant_bam_file,
            unpaired_bam_file=unpaired_bam_file,
            unmapped_bam_file=unmapped_bam_file,
            multimap_bam_file=multimap_bam_file,
            unresolved_bam_file=unresolved_bam_file,
            max_isize=runconfig.max_fragment_length,
            max_multihits=runconfig.max_multihits,
            library_type=runconfig.library_type)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Convert discordant transcriptome reads to genome coordinates
    #
    discordant_genome_bam_file = os.path.join(
        tmp_dir, config.DISCORDANT_GENOME_BAM_FILE)
    msg = "Converting discordant transcriptome hits to genomic coordinates"
    if (up_to_date(discordant_genome_bam_file, discordant_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        discordant_genome_sam_file = os.path.join(
            tmp_dir, config.DISCORDANT_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(
            genome_index,
            transcripts,
            input_file=discordant_bam_file,
            output_file=discordant_genome_sam_file,
            library_type=runconfig.library_type,
            input_sam=False,
            output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_sam_file):
                os.remove(discordant_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(discordant_genome_sam_file,
                             discordant_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_bam_file):
                os.remove(discordant_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(discordant_genome_sam_file):
            os.remove(discordant_genome_sam_file)
    #
    # Sort discordant reads by position
    #
    msg = "Sorting discordant BAM file"
    sorted_discordant_genome_bam_file = os.path.join(
        tmp_dir, config.SORTED_DISCORDANT_GENOME_BAM_FILE)
    if (up_to_date(sorted_discordant_genome_bam_file,
                   discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_discordant_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), discordant_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing discordant BAM file"
    sorted_discordant_bam_index_file = sorted_discordant_genome_bam_file + ".bai"
    if (up_to_date(sorted_discordant_bam_index_file,
                   sorted_discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_discordant_genome_bam_file)
    #
    # Convert unpaired transcriptome reads to genome coordinates
    #
    unpaired_genome_bam_file = os.path.join(tmp_dir,
                                            config.UNPAIRED_GENOME_BAM_FILE)
    msg = "Converting unpaired transcriptome hits to genomic coordinates"
    if (up_to_date(unpaired_genome_bam_file, unpaired_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        unpaired_genome_sam_file = os.path.join(
            tmp_dir, config.UNPAIRED_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(genome_index,
                                          transcripts,
                                          input_file=unpaired_bam_file,
                                          output_file=unpaired_genome_sam_file,
                                          library_type=runconfig.library_type,
                                          input_sam=False,
                                          output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_sam_file):
                os.remove(unpaired_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(unpaired_genome_sam_file,
                             unpaired_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_bam_file):
                os.remove(unpaired_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(unpaired_genome_sam_file):
            os.remove(unpaired_genome_sam_file)
    #
    # Sort unpaired reads by position
    #
    msg = "Sorting unpaired BAM file"
    sorted_unpaired_genome_bam_file = os.path.join(
        tmp_dir, config.SORTED_UNPAIRED_GENOME_BAM_FILE)
    if (up_to_date(sorted_unpaired_genome_bam_file, unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_unpaired_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), unpaired_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing unpaired BAM file"
    sorted_unpaired_bam_index_file = sorted_unpaired_genome_bam_file + ".bai"
    if (up_to_date(sorted_unpaired_bam_index_file,
                   sorted_unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_unpaired_genome_bam_file)
    #
    # Cluster discordant reads into chimera candidates
    #
    cluster_file = os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_FILE)
    cluster_shelve_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_SHELVE_FILE)
    sorted_discordant_genome_cluster_bam_file = \
        os.path.join(runconfig.output_dir,
                     config.SORTED_DISCORDANT_GENOME_CLUSTER_BAM_FILE)
    input_files = (sorted_discordant_genome_bam_file,
                   sorted_unpaired_genome_bam_file)
    output_files = (cluster_file, cluster_shelve_file,
                    sorted_discordant_genome_cluster_bam_file)
    msg = "Clustering discordant reads"
    skip = True
    for input_file in input_files:
        for output_file in output_files:
            skip = skip and up_to_date(output_file, input_file)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = cluster_discordant_reads(
            discordant_bam_file=sorted_discordant_genome_bam_file,
            unpaired_bam_file=sorted_unpaired_genome_bam_file,
            concordant_bam_file=sorted_transcriptome_bam_file,
            output_bam_file=sorted_discordant_genome_cluster_bam_file,
            cluster_file=cluster_file,
            cluster_shelve_file=cluster_shelve_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Pair discordant clusters
    #
    cluster_pair_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_PAIR_FILE)
    msg = "Pairing discordant clusters"
    output_files = (cluster_pair_file, )
    if up_to_date(cluster_pair_file,
                  sorted_discordant_genome_cluster_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = pair_discordant_clusters(
            discordant_bam_file=sorted_discordant_genome_cluster_bam_file,
            cluster_pair_file=cluster_pair_file,
            tmp_dir=tmp_dir)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Perform realignment across putative fusion breakpoints
    #
    breakpoint_bam_file = os.path.join(tmp_dir, config.BREAKPOINT_BAM_FILE)
    msg = "Realigning to find breakpoint-spanning reads"
    input_files = (sorted_discordant_genome_bam_file,
                   sorted_unpaired_genome_bam_file, cluster_shelve_file,
                   cluster_pair_file)
    output_files = (breakpoint_bam_file, )
    skip = True
    for inp in input_files:
        for outp in output_files:
            if not up_to_date(outp, inp):
                skip = False
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = realign_across_breakpoints(
            index_dir=runconfig.index_dir,
            discordant_bam_file=sorted_discordant_genome_bam_file,
            unpaired_bam_file=sorted_unpaired_genome_bam_file,
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=cluster_pair_file,
            breakpoint_bam_file=breakpoint_bam_file,
            log_dir=log_dir,
            tmp_dir=tmp_dir,
            num_processors=runconfig.num_processors,
            local_anchor_length=runconfig.local_anchor_length,
            local_multihits=runconfig.local_multihits)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Nominate breakpoint spanning reads (split reads)
    #
    spanning_sam_file = os.path.join(tmp_dir, config.SPANNING_SAM_FILE)
    spanning_bam_file = os.path.join(tmp_dir, config.SPANNING_BAM_FILE)
    spanning_cluster_pair_file = os.path.join(
        tmp_dir, config.SPANNING_CLUSTER_PAIR_FILE)
    msg = "Processing breakpoint-spanning alignments"
    input_files = (breakpoint_bam_file, cluster_shelve_file, cluster_pair_file)
    output_files = (spanning_bam_file, spanning_cluster_pair_file)
    skip = True
    for inp in input_files:
        for outp in output_files:
            if not up_to_date(outp, inp):
                skip = False
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = process_spanning_alignments(
            cluster_shelve_file=cluster_shelve_file,
            cluster_pair_file=cluster_pair_file,
            bam_file=breakpoint_bam_file,
            output_sam_file=spanning_sam_file,
            output_cluster_pair_file=spanning_cluster_pair_file,
            local_anchor_length=runconfig.local_anchor_length)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
        retcode = sam_to_bam(spanning_sam_file, spanning_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(spanning_bam_file):
                os.remove(spanning_bam_file)
            return config.JOB_ERROR
        if os.path.exists(spanning_sam_file):
            os.remove(spanning_sam_file)
    #
    # Sort unpaired reads by position
    #
    msg = "Sorting spanning BAM file"
    sorted_spanning_bam_file = os.path.join(runconfig.output_dir,
                                            config.SORTED_SPANNING_BAM_FILE)
    if (up_to_date(sorted_spanning_bam_file, spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_spanning_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), spanning_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing spanning BAM file"
    sorted_spanning_bam_index_file = sorted_spanning_bam_file + ".bai"
    if (up_to_date(sorted_spanning_bam_index_file, sorted_spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_spanning_bam_file)
    #
    # Write chimera file
    #
    unfiltered_chimera_bedpe_file = os.path.join(
        runconfig.output_dir, config.UNFILTERED_CHIMERA_BEDPE_FILE)
    msg = "Writing unfiltered chimeras to file %s" % (
        unfiltered_chimera_bedpe_file)
    if (up_to_date(unfiltered_chimera_bedpe_file, spanning_cluster_pair_file)
            and up_to_date(unfiltered_chimera_bedpe_file,
                           cluster_shelve_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = write_output(transcripts,
                               cluster_shelve_file=cluster_shelve_file,
                               cluster_pair_file=spanning_cluster_pair_file,
                               read_name_file=read_name_file,
                               output_file=unfiltered_chimera_bedpe_file,
                               annotation_source="ensembl")
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unfiltered_chimera_bedpe_file):
                os.remove(unfiltered_chimera_bedpe_file)
            return config.JOB_ERROR
    #
    # Filter chimeras
    #
    chimera_bedpe_file = os.path.join(runconfig.output_dir,
                                      config.CHIMERA_BEDPE_FILE)
    msg = "Filtering chimeras"
    if (up_to_date(chimera_bedpe_file, unfiltered_chimera_bedpe_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = filter_chimeras(
            input_file=unfiltered_chimera_bedpe_file,
            output_file=chimera_bedpe_file,
            filter_num_frags=runconfig.filter_num_frags,
            filter_allele_fraction=runconfig.filter_allele_fraction,
            mask_biotypes=mask_biotypes,
            mask_rnames=mask_rnames)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(chimera_bedpe_file):
                os.remove(chimera_bedpe_file)
            return config.JOB_ERROR
    #
    # Cleanup
    #
    if not runconfig.keep_tmp:
        logging.info("Cleaning up temporary files")
        shutil.rmtree(tmp_dir)
    #
    # Done
    #
    logging.info("Finished run.")
    return config.JOB_SUCCESS
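# A minimal sketch of the up_to_date dependency check used throughout
# run_chimerascan: a step is skipped when its output exists and is at least
# as new as its input, in make-like fashion. Inferred from usage; not the
# project's own implementation.
import os

def up_to_date_sketch(output_file, input_file):
    # missing output (or missing input) forces the step to rerun
    if not (os.path.exists(output_file) and os.path.exists(input_file)):
        return False
    return os.path.getmtime(output_file) >= os.path.getmtime(input_file)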
def build_transcriptome_annotation(gtf_files, output_file):
    # read gtf files and store transcripts
    transcripts = []
    for itm in gtf_files:
        fields = itm.split(",")
        filename,source = fields[0],None
        if len(fields) > 1:
            source = fields[1]
        logging.info("Reading gene features from %s (source=%s)" % (filename, source))
        for t in TranscriptFeature.from_gtf(open(filename), source=source):
            transcripts.append(t)
    logging.debug("\tRead %d annotations from %d files" % (len(transcripts), len(gtf_files)))
    # cluster transcripts by chromosome/strand/position
    logging.info("Determining transcript clusters")
    cur_transcript_id = 1
    cur_cluster_id = 1
    chrom_transcript_clusters = \
        collections.defaultdict(lambda: collections.defaultdict(list))
    for cluster in cluster_transcripts(transcripts):
        for t in cluster:
            t.tx_id = cur_transcript_id
            t.cluster_id = cur_cluster_id
            chrom_transcript_clusters[t.chrom][(t.introns,t.cluster_id)].append(t)
            cur_transcript_id += 1
        cur_cluster_id += 1
    logging.info("Found %d transcript clusters" % (cur_cluster_id))
    # merge genes in transcript clusters
    logging.info("Merging transcripts")
    outfh = open(output_file, "w")
    cur_transcript_id = 1
    for chrom in sorted(chrom_transcript_clusters):
        transcript_clusters = chrom_transcript_clusters[chrom]
        new_transcripts = []
        for cluster in transcript_clusters.itervalues():
            t = TranscriptFeature()
            t.chrom = chrom
            t.tx_start = min(x.tx_start for x in cluster)
            t.tx_end = max(x.tx_end for x in cluster)
            t.cluster_id = cluster[0].cluster_id 
            t.strand = cluster[0].strand
            t.exon_count = cluster[0].exon_count
            t.exons = list(cluster[0].exons)
            t.exons[0] = (t.tx_start, t.exons[0][1])
            t.exons[-1] = (t.exons[-1][0], t.tx_end)
            t.gene_biotype = "na"
            for x in cluster:
                if x.gene_biotype != "na":
                    t.gene_biotype = x.gene_biotype
                t.tx_names.extend(x.tx_names)
                t.gene_names.extend(x.gene_names)
                t.annotation_sources.extend(x.annotation_sources)
            new_transcripts.append(t)
        new_transcripts.sort(key=operator.attrgetter("tx_start"))
        for t in new_transcripts:
            t.tx_id = cur_transcript_id
            cur_transcript_id += 1
            print >>outfh, str(t)
    outfh.close()
    logging.info("Wrote gene annotation file")
def nominate_chimeras(index_dir, isize_dist_file, input_file, output_file, 
                      trim_bp, max_read_length, homology_mismatches):
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading transcript information")
    transcript_feature_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)    
    transcripts = list(TranscriptFeature.parse(open(transcript_feature_file)))
    tx_id_map = build_transcript_map(transcripts)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.TRANSCRIPTOME_FASTA_FILE)
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # keep track of mapping from breakpoint sequence to breakpoint id
    # this requires storing all breakpoint sequences in memory which is
    # potentially expensive.  TODO: investigate whether this should be
    # moved to a separate sort-update-sort procedure
    breakpoint_seq_name_map = {}
    breakpoint_num = 1
    # group discordant read pairs by gene
    logging.debug("Parsing discordant reads")
    chimera_num = 1
    outfh = open(output_file, "w")    
    for tx_id_5p, tx_id_3p, frags in parse_discordant_bedpe_by_transcript_pair(open(input_file)):
        # get gene information
        tx5p = tx_id_map[tx_id_5p]
        tx3p = tx_id_map[tx_id_3p]
        # bin fragments into putative breakpoints
        breakpoint_dict = collections.defaultdict(lambda: [])
        for dr5p,dr3p in frags:
            # given the insert size find the highest probability 
            # exon junction breakpoint between the two transcripts
            isize_prob, breakpoints = \
                choose_best_breakpoints(dr5p, dr3p, tx5p, tx3p, 
                                        trim_bp, isize_dist)
            for breakpoint in breakpoints:
                breakpoint_dict[breakpoint].append((dr5p, dr3p))        
        # iterate through breakpoints and build chimera candidates
        for breakpoint,frags in breakpoint_dict.iteritems():          
            exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
            breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                extract_breakpoint_sequence(tx_id_5p, tx_end_5p,
                                            tx_id_3p, tx_start_3p,
                                            ref_fa, max_read_length,
                                            homology_mismatches)                
            tx3p_length = sum((end - start) for start,end in tx3p.exons)
            # get unique breakpoint id based on sequence
            breakpoint_seq = breakpoint_seq_5p + breakpoint_seq_3p
            if breakpoint_seq in breakpoint_seq_name_map:
                breakpoint_name = breakpoint_seq_name_map[breakpoint_seq]
            else:
                breakpoint_name = "B%07d" % (breakpoint_num)
                breakpoint_seq_name_map[breakpoint_seq] = breakpoint_name
                breakpoint_num += 1
            # write gene, breakpoint, and raw reads to a file and follow the
            # BEDPE format
            gene_names_5p = ",".join(sorted(set(["_".join(x.split()) for x in tx5p.gene_names])))
            gene_names_3p = ",".join(sorted(set(["_".join(x.split()) for x in tx3p.gene_names])))
            fields = [tx5p.tx_id, 0, tx_end_5p,  # chrom1, start1, end1
                      tx3p.tx_id, tx_start_3p, tx3p_length, # chrom2, start2, end2
                      "C%07d" % (chimera_num), # name
                      1.0, # pvalue
                      tx5p.strand, tx3p.strand, # strand1, strand2
                      gene_names_5p, gene_names_3p, # gene names
                      # exon interval information
                      '%d-%d' % (0, exon_num_5p),
                      '%d-%d' % (exon_num_3p, len(tx3p.exons)),
                      # breakpoint information
                      breakpoint_name, 
                      breakpoint_seq_5p, breakpoint_seq_3p, 
                      homology_left, homology_right, 
                      # fragments
                      frags_to_encomp_string(frags),
                      # spanning reads
                      None]
            print >>outfh, '\t'.join(map(str, fields))
            chimera_num += 1
    outfh.close()
    ref_fa.close()
    return config.JOB_SUCCESS
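# nominate_chimeras assumes parse_discordant_bedpe_by_transcript_pair yields
# (tx_id_5p, tx_id_3p, fragments) groups from a BEDPE file sorted by
# transcript pair. A simplified sketch of that grouping, assuming the 5' and
# 3' transcript ids sit in the first and fourth BEDPE columns and leaving the
# per-fragment parsing as raw fields:
def parse_bedpe_by_transcript_pair_sketch(fileh):
    import itertools
    def pair_key(fields):
        return (fields[0], fields[3])
    lines = (line.rstrip("\n").split("\t") for line in fileh)
    for (tx5p, tx3p), group in itertools.groupby(lines, key=pair_key):
        yield tx5p, tx3p, list(group)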
Example #20
def write_output(input_file, bam_file, output_file, index_dir):
    # read transcripts
    logging.debug("Reading transcripts")
    transcript_file = os.path.join(index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    # build a lookup table to get genome coordinates from transcript 
    # coordinates
    transcript_genome_map = build_transcript_genome_map(transcripts)
    tx_id_map = build_transcript_map(transcripts)
    genome_tx_trees = build_genome_transcript_trees(transcripts)
    # open BAM file for checking wild-type isoform
    bamfh = pysam.Samfile(bam_file, "rb")   
    # group chimera isoforms together
    lines = []
    chimera_clusters = 0
    for key,chimeras in get_chimera_groups(input_file, tx_id_map):
        txs5p = set()
        txs3p = set()
        genes5p = set()
        genes3p = set()
        names = set()
        for c in chimeras:
            txs5p.add("%s:%d-%d" % (c.tx_name_5p, c.tx_start_5p, c.tx_end_5p-1))
            txs3p.add("%s:%d-%d" % (c.tx_name_3p, c.tx_start_3p, c.tx_end_3p-1))
            genes5p.add(c.gene_name_5p)
            genes3p.add(c.gene_name_3p)
            names.add(c.name)
        c = get_best_coverage_chimera(chimeras)
        # get chimera type and distance between genes
        chimera_type, distance = get_chimera_type(tx_id_map[c.tx_name_5p],
                                                  tx_id_map[c.tx_name_3p],
                                                  genome_tx_trees)
        # get genomic positions of chimera
        chrom5p,strand5p,start5p = transcript_to_genome_pos(c.tx_name_5p, c.tx_start_5p, transcript_genome_map)
        chrom5p,strand5p,end5p = transcript_to_genome_pos(c.tx_name_5p, c.tx_end_5p-1, transcript_genome_map)
        if strand5p == 1:
            start5p,end5p = end5p,start5p
        chrom3p,strand3p,start3p = transcript_to_genome_pos(c.tx_name_3p, c.tx_start_3p, transcript_genome_map)
        chrom3p,strand3p,end3p = transcript_to_genome_pos(c.tx_name_3p, c.tx_end_3p-1, transcript_genome_map)
        if strand3p == 1:
            start3p,end3p = end3p,start3p
        # get breakpoint spanning sequences
        spanning_seqs = set()
        spanning_fasta_lines = []
        for dr in c.get_spanning_reads():
            if dr.seq in spanning_seqs:
                continue
            spanning_seqs.add(dr.seq)
            spanning_fasta_lines.extend([">%s/%d;pos=%d;strand=%s" % 
                                         (dr.qname, dr.readnum+1, dr.pos, 
                                          "-" if dr.is_reverse else "+"), 
                                         dr.seq])
        # get isoform fraction
        num_wt_frags_5p, num_wt_frags_3p = get_wildtype_frags(c, bamfh)
        num_chimeric_frags = c.get_num_frags()
        frac5p = float(num_chimeric_frags) / (num_chimeric_frags + num_wt_frags_5p)
        frac3p = float(num_chimeric_frags) / (num_chimeric_frags + num_wt_frags_3p)
        # setup fields of BEDPE file
        fields = [chrom5p, start5p, end5p,
                  chrom3p, start3p, end3p,
                  "CLUSTER%d" % (chimera_clusters),
                  c.get_num_frags(),
                  "+" if (strand5p == 0) else "-",
                  "+" if (strand3p == 0) else "-",
                  ','.join(txs5p),
                  ','.join(txs3p),
                  ','.join(genes5p),
                  ','.join(genes3p),
                  chimera_type, distance,
                  c.get_num_frags(),
                  c.get_num_spanning_frags(),
                  c.get_num_unique_positions(),
                  frac5p, frac3p,
                  ','.join(spanning_fasta_lines),
                  ','.join(names)]
        lines.append(fields)
        chimera_clusters += 1
    bamfh.close()
    logging.debug("Clustered chimeras: %d" % (chimera_clusters))
    # sort by unique alignment positions, then spanning fragments, then total fragments
    lines = sorted(lines, key=operator.itemgetter(18, 17, 16), reverse=True)    
    f = open(output_file, "w")
    print >>f, '\t'.join(['#chrom5p', 'start5p', 'end5p', 
                          'chrom3p', 'start3p', 'end3p',
                          'chimera_cluster_id', 'score', 
                          'strand5p', 'strand3p',
                          'transcript_ids_5p', 'transcript_ids_3p',
                          'genes5p', 'genes3p',
                          'type', 'distance',
                          'total_frags', 
                          'spanning_frags',
                          'unique_alignment_positions',
                          'isoform_fraction_5p',
                          'isoform_fraction_3p',
                          'breakpoint_spanning_reads',
                          'chimera_ids'])
    for fields in lines:
        print >>f, '\t'.join(map(str, fields))
    f.close()
    return config.JOB_SUCCESS
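# write_output relies on transcript_to_genome_pos to map a transcript-relative
# offset to a (chrom, strand, genomic position) triple via the lookup table
# from build_transcript_genome_map. A self-contained sketch of the conversion
# for a single transcript, inferring from the surrounding code that exons are
# (start, end) pairs ordered along the genome and that strand 0 is forward
# while strand 1 is reverse:
def tx_pos_to_genome_pos_sketch(pos, exons, strand):
    # reverse-strand transcripts start at the genomic right end, so walk
    # the exons from right to left in that case
    ordered = list(reversed(exons)) if strand == 1 else list(exons)
    offset = pos
    for start, end in ordered:
        length = end - start
        if offset < length:
            return end - 1 - offset if strand == 1 else start + offset
        offset -= length
    raise ValueError("position %d beyond transcript length" % (pos))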
Example #21
def run_chimerascan(runconfig):
    """
    main function for running the chimerascan pipeline
    """
    # print a welcome message
    title_string = "Running chimerascan version %s" % (__version__)
    logging.info(title_string)
    logging.info("-" * len(title_string))
    # validate run configuration
    config_passed = runconfig.check_config()
    if not config_passed:
        logging.error("Invalid run configuration, aborting.")
        return config.JOB_ERROR
    # create output dir if it does not exist
    if not os.path.exists(runconfig.output_dir):
        os.makedirs(runconfig.output_dir)
        logging.info("Created output directory: %s" % (runconfig.output_dir))
    # create log dir if it does not exist
    log_dir = os.path.join(runconfig.output_dir, config.LOG_DIR)
    if not os.path.exists(log_dir):
        os.makedirs(log_dir)
        logging.debug("Created directory for log files: %s" % (log_dir))        
    # create tmp dir if it does not exist
    tmp_dir = os.path.join(runconfig.output_dir, config.TMP_DIR)
    if not os.path.exists(tmp_dir):
        os.makedirs(tmp_dir)
        logging.debug("Created directory for tmp files: %s" % (tmp_dir))
    # write the run config to a file
    xmlstring = runconfig.to_xml()
    runconfig_xml_file = os.path.join(runconfig.output_dir, config.RUNCONFIG_XML_FILE)
    logging.info("Writing run configuration to XML file: %s" % (runconfig_xml_file))
    fh = open(runconfig_xml_file, "w")
    print >>fh, xmlstring
    fh.close()
    # mask biotypes and references
    mask_biotypes = set()
    if runconfig.mask_biotypes_file:
        logging.info("Reading biotypes mask file")
        mask_biotypes.update([line.strip() for line in open(runconfig.mask_biotypes_file)])
        logging.info("\tread biotypes: %s" % (','.join(sorted(mask_biotypes))))
    mask_rnames = set()
    if runconfig.mask_rnames_file:
        logging.info("Reading references mask file")
        mask_rnames.update([line.strip() for line in open(runconfig.mask_rnames_file)])
        logging.info("\tread references: %s" % (','.join(sorted(mask_rnames))))
    # read transcripts
    logging.info("Reading transcript features")
    transcript_file = os.path.join(runconfig.index_dir, config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_file)))
    logging.info("\tread %d transcripts" % (len(transcripts)))
    # setup alignment indexes
    genome_index = os.path.join(runconfig.index_dir, config.GENOME_INDEX)
    transcriptome_index = os.path.join(runconfig.index_dir, config.TRANSCRIPTOME_INDEX)
    max_transcriptome_hits_file = os.path.join(runconfig.index_dir, 
                                               config.MAX_MULTIMAPPING_FILE)
    max_transcriptome_hits = int(open(max_transcriptome_hits_file).next().strip())
    # detect read length
    original_read_length = detect_read_length(runconfig.fastq_files[0])
    # minimum fragment length cannot be smaller than the trimmed read length
    trimmed_read_length = (original_read_length - runconfig.trim5 - runconfig.trim3)
    min_fragment_length = max(runconfig.min_fragment_length, trimmed_read_length)
    # 
    # Process and inspect the FASTQ files, performing several alterations 
    # to the reads:
    #
    # 1) rename them from long string to numbers to save space throughout
    #    the pipeline. also store mapping from read numbers to full names 
    #    in a separate file
    # 2) ensure the "/1" and "/2" suffixes exist to denote paired reads
    # 3) convert quality scores to sanger format
    # 
    converted_fastq_files = [os.path.join(tmp_dir, fq) 
                             for fq in config.CONVERTED_FASTQ_FILES]
    read_name_file = os.path.join(tmp_dir, config.READ_NAME_TXT_FILE)
    msg = "Processing FASTQ files"
    skip = all(up_to_date(cfq, fq) for cfq,fq in 
               zip(converted_fastq_files, runconfig.fastq_files))
    skip = skip and up_to_date(read_name_file, runconfig.fastq_files[0])
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        converted_fastq_prefix = \
            os.path.join(tmp_dir, config.CONVERTED_FASTQ_PREFIX)
        try:
            retcode = process_input_reads(runconfig.fastq_files, 
                                          converted_fastq_prefix,
                                          quals=runconfig.quals,
                                          trim5=runconfig.trim5,
                                          trim3=runconfig.trim3)
            if retcode != config.JOB_SUCCESS:
                logging.error("%s step failed" % (msg))
                return config.JOB_ERROR
        except Exception as e:
            logging.info("Cleaning up after error %s" % (str(e)))
            for fq in converted_fastq_files:
                if os.path.isfile(fq):
                    os.remove(fq)
            return config.JOB_ERROR
    #
    # Transcriptome alignment step
    #
    # Align to transcriptome in paired-end mode, trying to resolve as many 
    # reads as possible.
    #
    transcriptome_bam_file = os.path.join(tmp_dir, config.TRANSCRIPTOME_BAM_FILE)
    transcriptome_unaligned_path = os.path.join(tmp_dir, config.TRANSCRIPTOME_UNALIGNED_PATH)
    transcriptome_unaligned_fastq_files = tuple(os.path.join(tmp_dir, fq) for fq in config.TRANSCRIPTOME_UNALIGNED_FASTQ_FILES)
    msg = "Aligning paired-end reads to transcriptome"
    if (all(up_to_date(transcriptome_bam_file, fq) for fq in converted_fastq_files) and 
        all(up_to_date(a,b) for a,b in zip(transcriptome_unaligned_fastq_files, converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.TRANSCRIPTOME_LOG_FILE)
        retcode = bowtie2_align_transcriptome_pe(transcriptome_index=transcriptome_index,
                                                 genome_index=genome_index,
                                                 transcript_file=transcript_file,     
                                                 fastq_files=converted_fastq_files,
                                                 unaligned_path=transcriptome_unaligned_path,
                                                 bam_file=transcriptome_bam_file,
                                                 log_file=log_file,
                                                 library_type=runconfig.library_type,
                                                 min_fragment_length=min_fragment_length,
                                                 max_fragment_length=runconfig.max_fragment_length,
                                                 max_transcriptome_hits=max_transcriptome_hits,
                                                 num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(transcriptome_bam_file):
                os.remove(transcriptome_bam_file)
            for f in transcriptome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Sort transcriptome reads by position
    #
    msg = "Sorting transcriptome reads"
    sorted_transcriptome_bam_file = os.path.join(runconfig.output_dir, 
                                                 config.SORTED_TRANSCRIPTOME_BAM_FILE)
    if (up_to_date(sorted_transcriptome_bam_file, transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        sorted_aligned_bam_prefix = os.path.splitext(sorted_transcriptome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), transcriptome_bam_file, sorted_aligned_bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing BAM file"
    sorted_transcriptome_bam_index_file = sorted_transcriptome_bam_file + ".bai"
    if (up_to_date(sorted_transcriptome_bam_index_file, sorted_transcriptome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_transcriptome_bam_file)
    #
    # Get insert size distribution
    #
    isize_dist_file = os.path.join(runconfig.output_dir, 
                                   config.ISIZE_DIST_FILE)
    msg = "Profiling insert size distribution"
    if up_to_date(isize_dist_file, transcriptome_bam_file):
        logging.info("[SKIPPED] %s" % msg)
        isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file, "r"))
    else:
        logging.info(msg)
        bamfh = pysam.Samfile(sorted_transcriptome_bam_file, "rb")
        isize_dist = InsertSizeDistribution.from_genome_bam(bamfh, transcripts, 
                                                            min_isize=min_fragment_length, 
                                                            max_isize=runconfig.max_fragment_length, 
                                                            max_samples=config.ISIZE_MAX_SAMPLES)
        bamfh.close()
        # if not enough samples, use a normal distribution instead
        # of the empirical distribution
        if isize_dist.n < config.ISIZE_MIN_SAMPLES:
            logging.warning("Not enough fragments to sample insert size "
                            "distribution empirically.  Using mean=%d "
                            "stdev=%f instead" % 
                            (runconfig.isize_mean, 
                             runconfig.isize_stdev))
            isize_dist = InsertSizeDistribution.from_random(runconfig.isize_mean, 
                                                            runconfig.isize_stdev, 
                                                            min_isize=runconfig.min_fragment_length,
                                                            max_isize=runconfig.max_fragment_length,
                                                            samples=config.ISIZE_MAX_SAMPLES)
        isize_dist.to_file(open(isize_dist_file, "w"))
    #
    # Determine ideal segment length automatically
    #
    # log insert size statistics
    logging.info("Insert size samples=%d mean=%f std=%f median=%d mode=%d" % 
                 (isize_dist.n, isize_dist.mean(), isize_dist.std(), 
                  isize_dist.isize_at_percentile(50.0), isize_dist.mode()))    
    # choose a segment length to optimize mapping
    optimal_isize = isize_dist.isize_at_percentile(DEFAULT_FRAG_SIZE_SENSITIVITY)
    logging.info("Determining soft-clipped segment length")
    logging.debug("\tInsert size at %f percent of distribution is %d" % 
                 (DEFAULT_FRAG_SIZE_SENSITIVITY, optimal_isize))
    optimal_segment_length = int(round(optimal_isize / 3.0))
    logging.debug("\tOptimal segment length is %d/3.0 = %d" % (optimal_isize, optimal_segment_length))
    segment_length = min(optimal_segment_length, trimmed_read_length)
    segment_length = max(config.MIN_SEGMENT_LENGTH, segment_length)
    logging.debug("\tAfter adjusting for min %d and read length %d, final segment length is %d" % 
                 (config.MIN_SEGMENT_LENGTH, trimmed_read_length, segment_length))
    if runconfig.segment_length is not None:
        logging.debug("\tOverriding auto segment length and using segment length of %d" % (runconfig.segment_length))
        segment_length = runconfig.segment_length
    #
    # Genome alignment step
    #
    # Align any unaligned transcriptome reads to genome in paired-end mode.
    # Resolve as many reads as possible.
    #
    genome_bam_file = os.path.join(tmp_dir, config.GENOME_BAM_FILE)
    genome_unaligned_path = os.path.join(tmp_dir, config.GENOME_UNALIGNED_PATH)
    genome_unaligned_fastq_files = tuple(os.path.join(tmp_dir, fq) for fq in config.GENOME_UNALIGNED_FASTQ_FILES)
    msg = "Realigning unaligned paired-end reads to genome"
    if (all(up_to_date(genome_bam_file, fq) for fq in converted_fastq_files) and 
        all(up_to_date(a,b) for a,b in zip(genome_unaligned_fastq_files, converted_fastq_files))):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        log_file = os.path.join(log_dir, config.GENOME_LOG_FILE)
        retcode = bowtie2_align_pe(index=genome_index,
                                   fastq_files=transcriptome_unaligned_fastq_files,
                                   unaligned_path=genome_unaligned_path,
                                   bam_file=genome_bam_file,
                                   log_file=log_file,
                                   library_type=runconfig.library_type,
                                   min_fragment_length=min_fragment_length,
                                   max_fragment_length=runconfig.max_fragment_length,
                                   max_hits=max_transcriptome_hits,
                                   num_processors=runconfig.num_processors)
        # cleanup if job failed
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(genome_bam_file):
                os.remove(genome_bam_file)
            for f in genome_unaligned_fastq_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Realignment step
    #
    # trim and realign all the initially unaligned reads in order to
    # increase sensitivity to detect reads spanning fusion junctions
    #
    realigned_bam_file = os.path.join(tmp_dir, config.REALIGNED_BAM_FILE)
    realigned_log_file = os.path.join(log_dir, config.REALIGNED_LOG_FILE)
    msg = "Trimming and realigning initially unmapped reads"
    if (all(up_to_date(realigned_bam_file, fq) for fq in genome_unaligned_fastq_files) and
        up_to_date(realigned_bam_file, isize_dist_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = bowtie2_align_pe_sr(index=transcriptome_index,
                                      transcript_file=transcript_file,
                                      fastq_files=genome_unaligned_fastq_files,
                                      bam_file=realigned_bam_file,
                                      log_file=realigned_log_file,
                                      tmp_dir=tmp_dir,
                                      segment_length=segment_length,
                                      max_hits=max_transcriptome_hits,
                                      num_processors=runconfig.num_processors)
        if retcode != config.JOB_SUCCESS:
            if os.path.exists(realigned_bam_file):
                os.remove(realigned_bam_file)
            return config.JOB_ERROR
    #
    # Find discordant reads
    #
    # iterate through realigned reads and divide them into groups of
    # concordant, discordant within a gene (isoforms), discordant
    # between different genes, and discordant in the genome
    #
    paired_bam_file = os.path.join(tmp_dir, config.PAIRED_BAM_FILE)
    discordant_bam_file = os.path.join(tmp_dir, config.DISCORDANT_BAM_FILE)
    unpaired_bam_file = os.path.join(tmp_dir, config.UNPAIRED_BAM_FILE)
    unmapped_bam_file = os.path.join(tmp_dir, config.UNMAPPED_BAM_FILE)
    multimap_bam_file = os.path.join(tmp_dir, config.MULTIMAP_BAM_FILE)
    unresolved_bam_file = os.path.join(tmp_dir, config.UNRESOLVED_BAM_FILE)
    output_files = (paired_bam_file, discordant_bam_file, unpaired_bam_file,
                    unmapped_bam_file, multimap_bam_file, unresolved_bam_file)
    msg = "Classifying concordant and discordant read pairs"
    if (all(up_to_date(f, realigned_bam_file) for f in output_files)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = find_discordant_fragments(transcripts=transcripts,
                                            input_bam_file=realigned_bam_file,
                                            paired_bam_file=paired_bam_file,
                                            discordant_bam_file=discordant_bam_file,
                                            unpaired_bam_file=unpaired_bam_file,
                                            unmapped_bam_file=unmapped_bam_file,
                                            multimap_bam_file=multimap_bam_file,
                                            unresolved_bam_file=unresolved_bam_file,
                                            max_isize=runconfig.max_fragment_length,
                                            max_multihits=runconfig.max_multihits,
                                            library_type=runconfig.library_type)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Convert discordant transcriptome reads to genome coordinates
    #
    discordant_genome_bam_file = os.path.join(tmp_dir, config.DISCORDANT_GENOME_BAM_FILE)
    msg = "Converting discordant transcriptome hits to genomic coordinates"
    if (up_to_date(discordant_genome_bam_file, discordant_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)        
        discordant_genome_sam_file = os.path.join(tmp_dir, config.DISCORDANT_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(genome_index, transcripts, 
                                          input_file=discordant_bam_file, 
                                          output_file=discordant_genome_sam_file,
                                          library_type=runconfig.library_type,
                                          input_sam=False,
                                          output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_sam_file):
                os.remove(discordant_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(discordant_genome_sam_file, discordant_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(discordant_genome_bam_file):
                os.remove(discordant_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(discordant_genome_sam_file):
            os.remove(discordant_genome_sam_file)
    #
    # Sort discordant reads by position
    #
    msg = "Sorting discordant BAM file"
    sorted_discordant_genome_bam_file = os.path.join(tmp_dir, config.SORTED_DISCORDANT_GENOME_BAM_FILE)
    if (up_to_date(sorted_discordant_genome_bam_file, discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_discordant_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), discordant_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing discordant BAM file"
    sorted_discordant_bam_index_file = sorted_discordant_genome_bam_file + ".bai"
    if (up_to_date(sorted_discordant_bam_index_file, sorted_discordant_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_discordant_genome_bam_file)
    #
    # Convert unpaired transcriptome reads to genome coordinates
    #
    unpaired_genome_bam_file = os.path.join(tmp_dir, config.UNPAIRED_GENOME_BAM_FILE)
    msg = "Converting unpaired transcriptome hits to genomic coordinates"
    if (up_to_date(unpaired_genome_bam_file, unpaired_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)        
        unpaired_genome_sam_file = os.path.join(tmp_dir, config.UNPAIRED_GENOME_SAM_FILE)
        retcode = transcriptome_to_genome(genome_index, transcripts, 
                                          input_file=unpaired_bam_file, 
                                          output_file=unpaired_genome_sam_file,
                                          library_type=runconfig.library_type,
                                          input_sam=False,
                                          output_sam=True)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_sam_file):
                os.remove(unpaired_genome_sam_file)
            return config.JOB_ERROR
        retcode = sam_to_bam(unpaired_genome_sam_file, unpaired_genome_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unpaired_genome_bam_file):
                os.remove(unpaired_genome_bam_file)
            return config.JOB_ERROR
        if os.path.exists(unpaired_genome_sam_file):
            os.remove(unpaired_genome_sam_file)        
    #
    # Sort unpaired reads by position
    #
    msg = "Sorting unpaired BAM file"
    sorted_unpaired_genome_bam_file = os.path.join(tmp_dir, config.SORTED_UNPAIRED_GENOME_BAM_FILE)
    if (up_to_date(sorted_unpaired_genome_bam_file, unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_unpaired_genome_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), unpaired_genome_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing unpaired BAM file"
    sorted_unpaired_bam_index_file = sorted_unpaired_genome_bam_file + ".bai"
    if (up_to_date(sorted_unpaired_bam_index_file, sorted_unpaired_genome_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_unpaired_genome_bam_file)
    #
    # Cluster discordant reads into chimera candidates
    #
    cluster_file = os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_FILE)
    cluster_shelve_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_SHELVE_FILE)
    sorted_discordant_genome_cluster_bam_file = \
        os.path.join(runconfig.output_dir, 
                     config.SORTED_DISCORDANT_GENOME_CLUSTER_BAM_FILE)
    input_files = (sorted_discordant_genome_bam_file, 
                   sorted_unpaired_genome_bam_file)
    output_files = (cluster_file, cluster_shelve_file,                      
                    sorted_discordant_genome_cluster_bam_file)
    msg = "Clustering discordant reads"
    skip = True
    for input_file in input_files:
        for output_file in output_files:
            skip = skip and up_to_date(output_file, input_file)
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = cluster_discordant_reads(discordant_bam_file=sorted_discordant_genome_bam_file, 
                                           unpaired_bam_file=sorted_unpaired_genome_bam_file, 
                                           concordant_bam_file=sorted_transcriptome_bam_file, 
                                           output_bam_file=sorted_discordant_genome_cluster_bam_file, 
                                           cluster_file=cluster_file,
                                           cluster_shelve_file=cluster_shelve_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Pair discordant clusters
    #
    cluster_pair_file = \
        os.path.join(tmp_dir, config.DISCORDANT_CLUSTER_PAIR_FILE)
    msg = "Pairing discordant clusters"
    output_files = (cluster_pair_file,)
    if up_to_date(cluster_pair_file, sorted_discordant_genome_cluster_bam_file):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = pair_discordant_clusters(discordant_bam_file=sorted_discordant_genome_cluster_bam_file, 
                                           cluster_pair_file=cluster_pair_file, 
                                           tmp_dir=tmp_dir)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Perform realignment across putative fusion breakpoints
    #
    breakpoint_bam_file = os.path.join(tmp_dir, config.BREAKPOINT_BAM_FILE)
    msg = "Realigning to find breakpoint-spanning reads"
    input_files = (sorted_discordant_genome_bam_file, 
                   sorted_unpaired_genome_bam_file, 
                   cluster_shelve_file, 
                   cluster_pair_file)
    output_files = (breakpoint_bam_file,)
    skip = True
    for inp in input_files:
        for outp in output_files:
            if not up_to_date(outp, inp):
                skip = False
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.debug(msg)
        retcode = realign_across_breakpoints(index_dir=runconfig.index_dir,
                                             discordant_bam_file=sorted_discordant_genome_bam_file,
                                             unpaired_bam_file=sorted_unpaired_genome_bam_file,
                                             cluster_shelve_file=cluster_shelve_file,
                                             cluster_pair_file=cluster_pair_file,
                                             breakpoint_bam_file=breakpoint_bam_file,
                                             log_dir=log_dir,
                                             tmp_dir=tmp_dir,
                                             num_processors=runconfig.num_processors,
                                             local_anchor_length=runconfig.local_anchor_length,
                                             local_multihits=runconfig.local_multihits)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
    #
    # Nominate breakpoint spanning reads (split reads)
    #
    spanning_sam_file = os.path.join(tmp_dir, config.SPANNING_SAM_FILE)
    spanning_bam_file = os.path.join(tmp_dir, config.SPANNING_BAM_FILE)
    spanning_cluster_pair_file = os.path.join(tmp_dir, config.SPANNING_CLUSTER_PAIR_FILE)
    msg = "Processing breakpoint-spanning alignments"
    input_files = (breakpoint_bam_file,
                   cluster_shelve_file, 
                   cluster_pair_file)
    output_files = (spanning_bam_file,
                    spanning_cluster_pair_file)
    skip = True
    for inp in input_files:
        for outp in output_files:
            if not up_to_date(outp, inp):
                skip = False
    if skip:
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = process_spanning_alignments(cluster_shelve_file=cluster_shelve_file,
                                              cluster_pair_file=cluster_pair_file,
                                              bam_file=breakpoint_bam_file,                                              
                                              output_sam_file=spanning_sam_file,
                                              output_cluster_pair_file=spanning_cluster_pair_file,
                                              local_anchor_length=runconfig.local_anchor_length)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            for f in output_files:
                if os.path.exists(f):
                    os.remove(f)
            return config.JOB_ERROR
        retcode = sam_to_bam(spanning_sam_file, spanning_bam_file)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(spanning_bam_file):
                os.remove(spanning_bam_file)
            return config.JOB_ERROR
        if os.path.exists(spanning_sam_file):
            os.remove(spanning_sam_file)
    #
    # Sort spanning reads by position
    #
    msg = "Sorting spanning BAM file"
    sorted_spanning_bam_file = os.path.join(runconfig.output_dir, config.SORTED_SPANNING_BAM_FILE)
    if (up_to_date(sorted_spanning_bam_file, spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        bam_prefix = os.path.splitext(sorted_spanning_bam_file)[0]
        pysam.sort("-m", str(int(1e9)), spanning_bam_file, bam_prefix)
    #
    # Index BAM file
    #
    msg = "Indexing spanning BAM file"
    sorted_spanning_bam_index_file = sorted_spanning_bam_file + ".bai"
    if (up_to_date(sorted_spanning_bam_index_file, sorted_spanning_bam_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        pysam.index(sorted_spanning_bam_file)
    #
    # Write chimera file
    # 
    unfiltered_chimera_bedpe_file = os.path.join(runconfig.output_dir, 
                                                 config.UNFILTERED_CHIMERA_BEDPE_FILE)
    msg = "Writing unfiltered chimeras to file %s" % (unfiltered_chimera_bedpe_file)
    if (up_to_date(unfiltered_chimera_bedpe_file, spanning_cluster_pair_file) and
        up_to_date(unfiltered_chimera_bedpe_file, cluster_shelve_file)):                
        logging.info("[SKIPPED] %s" % (msg))
    else:
        logging.info(msg)
        retcode = write_output(transcripts, 
                               cluster_shelve_file=cluster_shelve_file, 
                               cluster_pair_file=spanning_cluster_pair_file, 
                               read_name_file=read_name_file, 
                               output_file=unfiltered_chimera_bedpe_file, 
                               annotation_source="ensembl")
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(unfiltered_chimera_bedpe_file):
                os.remove(unfiltered_chimera_bedpe_file)
            return config.JOB_ERROR
    #
    # Filter chimeras
    #
    chimera_bedpe_file = os.path.join(runconfig.output_dir, config.CHIMERA_BEDPE_FILE)
    msg = "Filtering chimeras"
    if (up_to_date(chimera_bedpe_file, unfiltered_chimera_bedpe_file)):
        logging.info("[SKIPPED] %s" % (msg))
    else:        
        logging.info(msg)
        retcode = filter_chimeras(input_file=unfiltered_chimera_bedpe_file, 
                                  output_file=chimera_bedpe_file,
                                  filter_num_frags=runconfig.filter_num_frags,
                                  filter_allele_fraction=runconfig.filter_allele_fraction,
                                  mask_biotypes=mask_biotypes,
                                  mask_rnames=mask_rnames)
        if retcode != config.JOB_SUCCESS:
            logging.error("[FAILED] %s" % (msg))
            if os.path.exists(chimera_bedpe_file):
                os.remove(chimera_bedpe_file)
            return config.JOB_ERROR
    #
    # Cleanup
    # 
    if not runconfig.keep_tmp:
        logging.info("Cleaning up temporary files")
        shutil.rmtree(tmp_dir)
    #
    # Done
    # 
    logging.info("Finished run.")
    return config.JOB_SUCCESS
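# The SAM-to-BAM conversions above go through a sam_to_bam helper whose body
# is not shown here. A sketch that shells out to samtools (assuming samtools
# is on the PATH; the real helper may well use pysam instead):
def sam_to_bam_sketch(sam_file, bam_file):
    import subprocess
    retcode = subprocess.call(["samtools", "view", "-bS",
                               "-o", bam_file, sam_file])
    return config.JOB_SUCCESS if retcode == 0 else config.JOB_ERROR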
Example #22
def nominate_chimeras(index_dir, isize_dist_file, input_file, output_file,
                      trim_bp, max_read_length, homology_mismatches):
    # read insert size distribution
    isize_dist = InsertSizeDistribution.from_file(open(isize_dist_file))
    # build a lookup table to get genomic intervals from transcripts
    logging.debug("Reading transcript information")
    transcript_feature_file = os.path.join(index_dir,
                                           config.TRANSCRIPT_FEATURE_FILE)
    transcripts = list(TranscriptFeature.parse(open(transcript_feature_file)))
    tx_id_map = build_transcript_map(transcripts)
    # open the reference sequence fasta file
    ref_fasta_file = os.path.join(index_dir, config.TRANSCRIPTOME_FASTA_FILE)
    ref_fa = pysam.Fastafile(ref_fasta_file)
    # keep track of mapping from breakpoint sequence to breakpoint id
    # this requires storing all breakpoint sequences in memory which is
    # potentially expensive.  TODO: investigate whether this should be
    # moved to a separate sort-update-sort procedure
    breakpoint_seq_name_map = {}
    breakpoint_num = 1
    # group discordant read pairs by gene
    logging.debug("Parsing discordant reads")
    chimera_num = 1
    outfh = open(output_file, "w")
    for tx_id_5p, tx_id_3p, frags in parse_discordant_bedpe_by_transcript_pair(
            open(input_file)):
        # get gene information
        tx5p = tx_id_map[tx_id_5p]
        tx3p = tx_id_map[tx_id_3p]
        # bin fragments into putative breakpoints
        breakpoint_dict = collections.defaultdict(lambda: [])
        for dr5p, dr3p in frags:
            # given the insert size find the highest probability
            # exon junction breakpoint between the two transcripts
            isize_prob, breakpoints = \
                choose_best_breakpoints(dr5p, dr3p, tx5p, tx3p,
                                        trim_bp, isize_dist)
            for breakpoint in breakpoints:
                breakpoint_dict[breakpoint].append((dr5p, dr3p))
        # iterate through breakpoints and build chimera candidates
        for breakpoint, frags in breakpoint_dict.iteritems():
            exon_num_5p, tx_end_5p, exon_num_3p, tx_start_3p = breakpoint
            breakpoint_seq_5p, breakpoint_seq_3p, homology_left, homology_right = \
                extract_breakpoint_sequence(tx_id_5p, tx_end_5p,
                                            tx_id_3p, tx_start_3p,
                                            ref_fa, max_read_length,
                                            homology_mismatches)
            tx3p_length = sum((end - start) for start, end in tx3p.exons)
            # get unique breakpoint id based on sequence
            breakpoint_seq = breakpoint_seq_5p + breakpoint_seq_3p
            if breakpoint_seq in breakpoint_seq_name_map:
                breakpoint_name = breakpoint_seq_name_map[breakpoint_seq]
            else:
                breakpoint_name = "B%07d" % (breakpoint_num)
                breakpoint_seq_name_map[breakpoint_seq] = breakpoint_name
                breakpoint_num += 1
            # write gene, breakpoint, and raw reads to a file and follow the
            # BEDPE format
            gene_names_5p = ",".join(
                sorted(set(["_".join(x.split()) for x in tx5p.gene_names])))
            gene_names_3p = ",".join(
                sorted(set(["_".join(x.split()) for x in tx3p.gene_names])))
            fields = [
                tx5p.tx_id,
                0,
                tx_end_5p,  # chrom1, start1, end1
                tx3p.tx_id,
                tx_start_3p,
                tx3p_length,  # chrom2, start2, end2
                "C%07d" % (chimera_num),  # name
                1.0,  # pvalue
                tx5p.strand,
                tx3p.strand,  # strand1, strand2
                gene_names_5p,
                gene_names_3p,  # gene names
                # exon interval information
                '%d-%d' % (0, exon_num_5p),
                '%d-%d' % (exon_num_3p, len(tx3p.exons)),
                # breakpoint information
                breakpoint_name,
                breakpoint_seq_5p,
                breakpoint_seq_3p,
                homology_left,
                homology_right,
                # fragments
                frags_to_encomp_string(frags),
                # spanning reads
                None
            ]
            print >> outfh, '\t'.join(map(str, fields))
            chimera_num += 1
    outfh.close()
    ref_fa.close()
    return config.JOB_SUCCESS
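# extract_breakpoint_sequence is assumed to pull read-length windows on
# either side of the candidate junction from the transcriptome FASTA. A
# simplified sketch of just the sequence-fetching half (the homology
# counting is omitted, and transcript ids are used directly as FASTA
# reference names):
def fetch_breakpoint_flanks_sketch(ref_fa, tx_id_5p, tx_end_5p,
                                   tx_id_3p, tx_start_3p, read_length):
    # pysam.Fastafile.fetch takes zero-based half-open coordinates
    start_5p = max(0, tx_end_5p - read_length)
    seq_5p = ref_fa.fetch(str(tx_id_5p), start_5p, tx_end_5p)
    seq_3p = ref_fa.fetch(str(tx_id_3p), tx_start_3p,
                          tx_start_3p + read_length)
    return seq_5p, seq_3p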
Example #23
def build_transcriptome_annotation(gtf_files, output_file):
    # read gtf files and store transcripts
    transcripts = []
    for itm in gtf_files:
        fields = itm.split(",")
        filename, source = fields[0], None
        if len(fields) > 1:
            source = fields[1]
        logging.info("Reading gene features from %s (source=%s)" %
                     (filename, source))
        for t in TranscriptFeature.from_gtf(open(filename), source=source):
            transcripts.append(t)
    logging.debug("\tRead %d annotations from %d files" %
                  (len(transcripts), len(gtf_files)))
    # cluster transcripts by chromosome/strand/position
    logging.info("Determining transcript clusters")
    cur_transcript_id = 1
    cur_cluster_id = 1
    chrom_transcript_clusters = collections.defaultdict(
        lambda: collections.defaultdict(lambda: []))
    for cluster in cluster_transcripts(transcripts):
        for t in cluster:
            t.tx_id = cur_transcript_id
            t.cluster_id = cur_cluster_id
            chrom_transcript_clusters[t.chrom][(t.introns,
                                                t.cluster_id)].append(t)
            cur_transcript_id += 1
        cur_cluster_id += 1
    logging.info("Found %d transcript clusters" % (cur_cluster_id))
    # merge genes in transcript clusters
    logging.info("Merging transcripts")
    outfh = open(output_file, "w")
    cur_transcript_id = 1
    for chrom in sorted(chrom_transcript_clusters):
        transcript_clusters = chrom_transcript_clusters[chrom]
        new_transcripts = []
        for cluster in transcript_clusters.itervalues():
            t = TranscriptFeature()
            t.chrom = chrom
            t.tx_start = min(x.tx_start for x in cluster)
            t.tx_end = max(x.tx_end for x in cluster)
            t.cluster_id = cluster[0].cluster_id
            t.strand = cluster[0].strand
            t.exon_count = cluster[0].exon_count
            t.exons = list(cluster[0].exons)
            t.exons[0] = (t.tx_start, t.exons[0][1])
            t.exons[-1] = (t.exons[-1][0], t.tx_end)
            t.gene_biotype = "na"
            for x in cluster:
                if x.gene_biotype != "na":
                    t.gene_biotype = x.gene_biotype
                t.tx_names.extend(x.tx_names)
                t.gene_names.extend(x.gene_names)
                t.annotation_sources.extend(x.annotation_sources)
            new_transcripts.append(t)
        new_transcripts.sort(key=operator.attrgetter("tx_start"))
        for t in new_transcripts:
            t.tx_id = cur_transcript_id
            cur_transcript_id += 1
            print >> outfh, str(t)
    outfh.close()
    logging.info("Wrote gene annotation file")