Example #1
def find_first_orfs(gtf_file, genome_fasta_file):
    #
    # extract transcript DNA sequences, translate to protein, and
    # search for first ORF
    #
    logging.debug('Finding ORFs in transcript sequences')
    # open genome fasta file
    ref_fa = pysam.Fastafile(genome_fasta_file)
    num_finished = 1
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            if t.strand == NO_STRAND:
                t.strand = POS_STRAND
                orfpos = find_first_orf(t, ref_fa)
                t.strand = NEG_STRAND
                orfneg = find_first_orf(t, ref_fa)
                if len(orfpos.seq) >= len(orfneg.seq):
                    yield orfpos
                else:
                    yield orfneg
            else:
                yield find_first_orf(t, ref_fa)
            if (num_finished % 10000) == 0:
                logging.debug('Processed %d transcripts' % (num_finished))
            num_finished += 1
    # cleanup
    ref_fa.close()
    return
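A minimal driver sketch for the generator above; it assumes only the .seq attribute already used in the strand comparison, and the file names are placeholders.
def print_first_orf_lengths(gtf_file, genome_fasta_file):
    # hypothetical usage: report the length of the first ORF found per transcript
    for orf in find_first_orfs(gtf_file, genome_fasta_file):
        print len(orf.seq)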
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    # parse transcripts
    num_transcripts = 0
    # keep track of redundant gene/transcript counts
    gene_map = collections.defaultdict(lambda: {})
    transcript_map = collections.defaultdict(lambda: {})
    for transcripts in parse_gtf(open(gtf_file)):
        for t in transcripts:
            ttype, tcat, tname = impute_transcript(t, gene_map, transcript_map)
            # write new attributes
            for f in t.to_gtf_features(source='assemblyline', score=1000):
                f.attrs['transcript_type'] = ttype
                f.attrs['transcript_category'] = tcat
                f.attrs['transcript_name'] = tname
                print str(f)
            num_transcripts += 1
    return 0
Example #3
def main():
    # setup logging
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('-g',
                        dest='gene_id',
                        action='store_true',
                        default=False)
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    # check command line parameters
    if not os.path.exists(args.gtf_file):
        parser.error("gtf file %s not found" % (args.gtf_file))
    for transcripts in parse_gtf(open(args.gtf_file)):
        for t in transcripts:
            if args.gene_id:
                name = '%s|%s' % (t.attrs['gene_id'], t.attrs['transcript_id'])
            else:
                name = t.attrs['transcript_id']
            fields = write_bed(t.chrom, name, t.strand, 1000, t.exons)
            print '\t'.join(fields)
    return 0
Example #4
def find_first_orfs(gtf_file, genome_fasta_file):
    #
    # extract transcript DNA sequences, translate to protein, and
    # search for first ORF
    #
    logging.debug('Finding ORFs in transcript sequences')
    # open genome fasta file
    ref_fa = pysam.Fastafile(genome_fasta_file)
    num_finished = 1
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            if t.strand == NO_STRAND:
                t.strand = POS_STRAND
                orfpos = find_first_orf(t, ref_fa)
                t.strand = NEG_STRAND
                orfneg = find_first_orf(t, ref_fa)
                if len(orfpos.seq) >= len(orfneg.seq):
                    yield orfpos
                else:
                    yield orfneg 
            else:
                yield find_first_orf(t, ref_fa)
            if (num_finished % 10000) == 0:
                logging.debug('Processed %d transcripts' % (num_finished))
            num_finished += 1
    # cleanup
    ref_fa.close()
    return
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    # parse transcripts
    num_transcripts = 0
    # keep track of redundant gene/transcript counts
    gene_map = collections.defaultdict(lambda: {})
    transcript_map = collections.defaultdict(lambda: {})
    for transcripts in parse_gtf(open(gtf_file)):
        for t in transcripts:
            ttype, tcat, tname = impute_transcript(t, gene_map, transcript_map)
            # write new attributes
            for f in t.to_gtf_features(source='assemblyline', score=1000):
                f.attrs['transcript_type'] = ttype
                f.attrs['transcript_category'] = tcat 
                f.attrs['transcript_name'] = tname
                print str(f)
            num_transcripts += 1
    return 0
def write_transcript_table(gtf_file, table_file):
    fileh = open(table_file, 'w')
    print >> fileh, '\t'.join(get_classify_header_fields())
    for transcripts in parse_gtf(open(gtf_file)):
        for t in transcripts:
            fields = get_classify_fields(t)
            print >> fileh, '\t'.join(map(str, fields))
    fileh.close()
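A one-line usage sketch for write_transcript_table; both file names here are placeholder assumptions.
# hypothetical call: dump per-transcript classification fields to a tab-delimited file
write_transcript_table('assembly.gtf', 'transcript_table.txt')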
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', type=int, dest='upstream', default=1000)
    parser.add_argument('-d', type=int, dest='downstream', default=0)
    parser.add_argument('gtf_file')
    parser.add_argument('chrom_sizes')
    args = parser.parse_args()
    upstream = args.upstream
    downstream = args.downstream
    chrom_sizes_file = args.chrom_sizes
    # check command line parameters
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    chrom_sizes = {}
    with open(chrom_sizes_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            chrom_sizes[fields[0]] = int(fields[1])
    # parse
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug(
            "[LOCUS] %s:%d-%d %d transcripts" %
            (locus_chrom, locus_start, locus_end, len(locus_transcripts)))
        tss_ids = set()
        for t in locus_transcripts:
            if t.strand == NO_STRAND:
                continue
            tss_id = t.attrs['tss_id']
            if tss_id in tss_ids:
                continue
            tss_ids.add(tss_id)
            if t.strand == POS_STRAND:
                start = t.exons[0].start - upstream
                start = max(0, start)
                end = t.exons[0].start + downstream
                end = min(t.end, end)
            else:
                start = t.exons[-1].end - downstream
                start = max(t.start, start)
                end = t.exons[-1].end + upstream
                end = min(end, chrom_sizes[locus_chrom])
            print '\t'.join(
                map(str, [
                    locus_chrom, start, end, tss_id, 0,
                    strand_int_to_str(t.strand)
                ]))

    return 0
Example #9
def parse_gtf_tss(gtf_file):
    tss_dict = {}
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:            
            if t.strand == NO_STRAND:
                continue
            tss_id = t.attrs['tss_id']
            if tss_id in tss_dict:
                continue
            tss_dict[tss_id] = (t.chrom, t.strand, t.start, t.end)
    return tss_dict
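A small usage sketch for parse_gtf_tss, assuming the strand_int_to_str helper used elsewhere in these examples; the BED-like column layout is only an illustration.
def dump_tss_bed(gtf_file):
    # hypothetical helper: emit one BED-like line per unique TSS id
    tss_dict = parse_gtf_tss(gtf_file)
    for tss_id, (chrom, strand, start, end) in tss_dict.iteritems():
        print '\t'.join(map(str, [chrom, start, end, tss_id, 0,
                                  strand_int_to_str(strand)]))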
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    parser.add_argument('chrom_sizes')
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    # read one locus at a time
    locus_file = args.output_prefix + '.locus.bed'
    intergenic_file = args.output_prefix + '.intergenic.bed'
    intron_file = args.output_prefix + '.intron.bed'
    locus_fileh = open(locus_file, 'w')
    introns = set()
    logging.info('Parsing transcripts by locus')
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        # find borders of locus
        locus_chrom = locus_transcripts[0].chrom
        locus_start = min(t.start for t in locus_transcripts)
        locus_end = max(t.end for t in locus_transcripts)
        print >> locus_fileh, '\t'.join(
            [locus_chrom, str(locus_start),
             str(locus_end)])
        logging.debug(
            "[LOCUS] %s:%d-%d %d transcripts" %
            (locus_chrom, locus_start, locus_end, len(locus_transcripts)))
        # cluster locus exons
        cluster_tree = ClusterTree(0, 1)
        for t in locus_transcripts:
            # update locus
            for e in t.exons:
                cluster_tree.insert(e.start, e.end, 1)
        exon_clusters = []
        for start, end, indexes in cluster_tree.getregions():
            exon_clusters.append((start, end))
        # get intronic regions
        e1 = exon_clusters[0]
        for j in xrange(1, len(exon_clusters)):
            e2 = exon_clusters[j]
            introns.add((locus_chrom, e1[1], e2[0]))
            e1 = e2
    locus_fileh.close()
    # write introns
    logging.info('Writing introns')
    intron_fileh = open(intron_file, 'w')
    for chrom, start, end in sorted(introns):
        print >> intron_fileh, '\t'.join([chrom, str(start), str(end)])
    intron_fileh.close()
    # take complement of locus file to find intergenic regions
    logging.info('Complementing locus intervals to find intergenic regions')
    bedtools_args = ['bedtools', 'complement', '-i', locus_file, '-g',
                     args.chrom_sizes]
    with open(intergenic_file, 'w') as f:
        subprocess.call(bedtools_args, stdout=f)
def main():
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("-v", "--verbose", action="store_true", 
                        dest="verbose", default=False)
    parser.add_argument("ref_gtf_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    # set logging level
    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(level=level,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # check command line parameters
    if not os.path.exists(args.ref_gtf_file):
        parser.error("GTF file %s not found" % (args.ref_gtf_file))
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")   
    # show parameters
    logging.info("Parameters:")
    logging.info("verbose logging:       %s" % (args.verbose))
    logging.info("ref gtf file:          %s" % (args.ref_gtf_file))
    logging.info("assembly gtf file:     %s" % (args.gtf_file))
    # find CDS regions
    if not os.path.exists('tmp.srt.gtf'):
        with open('tmp.gtf', 'w') as outfileh:
            logging.info("Reading CDS regions from reference GTF")
            for f in get_cds_features(args.ref_gtf_file):
                print >>outfileh, str(f)
            logging.info("Reading transcripts from assembly GTF")
            i = 0
            for f in GTFFeature.parse(open(args.gtf_file)):
                print >>outfileh, str(f)
                i += 1
                if i % 100000 == 0:
                    logging.debug("Parsed %d transcripts" % (i))
        logging.info("Sorting GTF file")
        sort_gtf('tmp.gtf', 'tmp.srt.gtf')
    for locus_transcripts in parse_gtf(open('tmp.srt.gtf')):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" % 
                      (locus_chrom, locus_start, locus_end, 
                       len(locus_transcripts)))
        for start, end, strand, m, t, c in categorize(locus_transcripts):
            fields = [locus_chrom, str(start), str(end),
                      '%s|%s|%s' % (m, t, c), '0',
                      strand_int_to_str(strand)]
            print '\t'.join(fields)
    return 0
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    parser.add_argument('chrom_sizes')
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    # read one locus at a time
    locus_file = args.output_prefix + '.locus.bed'
    intergenic_file = args.output_prefix + '.intergenic.bed'
    intron_file = args.output_prefix + '.intron.bed'
    locus_fileh = open(locus_file, 'w')
    introns = set()
    logging.info('Parsing transcripts by locus')
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        # find borders of locus
        locus_chrom = locus_transcripts[0].chrom
        locus_start = min(t.start for t in locus_transcripts)
        locus_end = max(t.end for t in locus_transcripts)
        print >>locus_fileh, '\t'.join([locus_chrom, str(locus_start), str(locus_end)])
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" % 
                      (locus_chrom, locus_start, locus_end, 
                       len(locus_transcripts)))
        # cluster locus exons
        cluster_tree = ClusterTree(0,1)
        for t in locus_transcripts:
            # update locus         
            for e in t.exons:
                cluster_tree.insert(e.start, e.end, 1)
        exon_clusters = []
        for start,end,indexes in cluster_tree.getregions():
            exon_clusters.append((start,end))
        # get intronic regions
        e1 = exon_clusters[0]        
        for j in xrange(1, len(exon_clusters)):
            e2 = exon_clusters[j]
            introns.add((locus_chrom, e1[1], e2[0]))
            e1 = e2
    locus_fileh.close()
    # write introns
    logging.info('Writing introns')
    intron_fileh = open(intron_file, 'w')
    for chrom, start,end in sorted(introns):
        print >>intron_fileh, '\t'.join([chrom, str(start), str(end)])
    intron_fileh.close()
    # take complement of locus file to find intergenic regions
    logging.info('Complementing locus intervals to find intergenic regions')
    bedtools_args = ['bedtools', 'complement',
                     '-i', locus_file,
                     '-g', args.chrom_sizes]
    with open(intergenic_file, 'w') as f:
        subprocess.call(bedtools_args, stdout=f)
def compare_assembly(ref_gtf_file, test_gtf_file, output_dir, 
                     gtf_score_attr, tmp_dir):
    # output files
    compare_file = os.path.join(output_dir, "compare_transcripts.txt")
    global_stats_file = os.path.join(output_dir, "global_stats.txt")
    tmp_gtf_file = os.path.join(output_dir, "tmp.gtf")
    tmp_sorted_gtf_file = os.path.splitext(tmp_gtf_file)[0] + ".srt.gtf"
    # merge and sort ref/test gtf files
    logging.info("Merging reference and test GTF files")
    # make temporary file to store merged ref/test gtf files
    outfh = open(tmp_gtf_file, "w")
    logging.info("Adding reference GTF file")
    add_gtf_file(ref_gtf_file, outfh, is_ref=True, sample_id=None)
    logging.info("Adding test GTF file")
    add_gtf_file(test_gtf_file, outfh, is_ref=False, sample_id='assembly')
    outfh.close()
    logging.info("Sorting merged GTF file")
    sort_gtf(tmp_gtf_file, tmp_sorted_gtf_file, tmp_dir=tmp_dir)
    os.remove(tmp_gtf_file)
    # compare assemblies
    logging.info("Comparing assemblies")
    cmp_fh = open(compare_file, "w")
    print >>cmp_fh, '\t'.join(map(str, MatchStats.header_fields()))
    stats_obj = GlobalStats()
    for locus_transcripts in parse_gtf(open(tmp_sorted_gtf_file)):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" % 
                      (locus_chrom, locus_start, locus_end, 
                       len(locus_transcripts)))    
        # score transcripts
        for t in locus_transcripts:
            if gtf_score_attr is None:
                t.score = 0.0
            else:
                t.score = float(t.attrs.get(gtf_score_attr, 0.0))
        # run comparison
        for mobj in compare_locus(locus_transcripts):
            print >>cmp_fh, str(mobj)
        # measure global stats
        locus_stats_obj = gather_global_stats(locus_transcripts)
        stats_obj = stats_obj + locus_stats_obj        
    # cleanup
    cmp_fh.close()
    logging.info("Printing report")    
    f = open(global_stats_file, "w")
    print >>f, stats_obj.report()
    os.remove(tmp_sorted_gtf_file)
    logging.info("Done")    
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('-u', type=int, dest='upstream', default=1000)
    parser.add_argument('-d', type=int, dest='downstream', default=0)
    parser.add_argument('gtf_file')
    parser.add_argument('chrom_sizes')
    args = parser.parse_args()
    upstream = args.upstream
    downstream = args.downstream
    chrom_sizes_file = args.chrom_sizes
    # check command line parameters
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    chrom_sizes = {}
    with open(chrom_sizes_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            chrom_sizes[fields[0]] = int(fields[1])
    # parse
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug("[LOCUS] %s:%d-%d %d transcripts" % 
                      (locus_chrom, locus_start, locus_end, 
                       len(locus_transcripts)))
        tss_ids = set()
        for t in locus_transcripts:            
            if t.strand == NO_STRAND:
                continue
            tss_id = t.attrs['tss_id']
            if tss_id in tss_ids:
                continue
            tss_ids.add(tss_id)
            if t.strand == POS_STRAND:
                start = t.exons[0].start - upstream
                start = max(0, start)
                end = t.exons[0].start + downstream
                end = min(t.end, end)
            else:
                start = t.exons[-1].end - downstream
                start = max(t.start, start)
                end = t.exons[-1].end + upstream
                end = min(end, chrom_sizes[locus_chrom])
            print '\t'.join(map(str, [locus_chrom, start, end, tss_id, 0, strand_int_to_str(t.strand)]))

    return 0
Example #15
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("ref_gtf_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    logging.info("Reading reference GTF file")
    locus_trees = read_reference_gtf(args.ref_gtf_file)
    logging.info("Categorizing test GTF file")
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        for t in locus_transcripts:        
            categorize_transcript(t, locus_trees)
            for f in t.to_gtf_features():
                print str(f)
def full_transcript_analysis(gtf_file, genome_fasta_file, pfam_dir, output_dir,
                             num_processes):
    # output files
    pfam_file = os.path.join(output_dir, 'full_length_pfam.txt')
    # open genome fasta file
    ref_fa = pysam.Fastafile(genome_fasta_file)
    # convert transcripts to amino acid sequences and write to fasta file
    logging.debug('Writing transcript amino acid sequences to FASTA file(s)')
    tmp_dir = os.path.join(output_dir, 'tmp')
    os.makedirs(tmp_dir)
    fasta_prefix = os.path.join(tmp_dir, 'full')
    fasta_files = []
    fasta_sizes = []
    for i in xrange(num_processes):
        fasta_files.append(open('%s%d.fasta' % (fasta_prefix, i), 'w'))
        fasta_sizes.append(0)
    num_finished = 1
    fasta_file_index = 0
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            # get amino acid sequences in all reading frames
            aa_seqs = translate_transcript(t, ref_fa)
            for frame, aa_seq in enumerate(aa_seqs):
                lines = to_fasta(
                    '%s|frame=%d' % (t.attrs['transcript_id'], frame), aa_seq)
                print >> fasta_files[fasta_file_index], lines
            fasta_sizes[fasta_file_index] += 1
            fasta_file_index = (fasta_file_index + 1) % num_processes
            if (num_finished % 10000) == 0:
                logging.debug('Processed %d transcripts' % (num_finished))
            num_finished += 1
    # get fasta files with lines written
    fasta_file_names = []
    for i in xrange(len(fasta_files)):
        fasta_files[i].close()
        if fasta_sizes[i] > 0:
            fasta_file_names.append(fasta_files[i].name)
    # cleanup
    ref_fa.close()
    #
    # search FASTA file against Pfam
    #
    logging.debug('Scanning for Pfam domains')
    retcode = run_pfam(fasta_file_names, pfam_dir, pfam_file, tmp_dir)
    if retcode != 0:
        logging.error('Error running pfam_scan.pl')
        return retcode
    return 0
def classify_transcripts(classify_dir, num_processors, gtf_score_attr,
                         tmp_dir):
    # setup input and output files
    lib_counts_file = os.path.join(classify_dir, LIB_COUNTS_FILE)
    lib_counts_list = list(LibCounts.from_file(lib_counts_file))
    library_ids = [x.library_id for x in lib_counts_list]
    category_info_dict = {}
    for category_key in CATEGORIES:
        category_str = category_int_to_str[category_key]
        cinfo = CategoryInfo.create(library_ids, category_key, category_str,
                                    classify_dir)
        category_info_dict[category_key] = cinfo
        # write input files for classifier
        logging.info("Writing classification input files category='%s'" %
                     (cinfo.category_str))
        for transcripts in parse_gtf(open(cinfo.output_gtf_file)):
            for t in transcripts:
                # set transcript score
                t.score = float(t.attrs.get(gtf_score_attr, 0.0))
                library_id = t.attrs[GTFAttr.LIBRARY_ID]
                fields = get_classification_fields(t)
                # lookup file handle and open new file if necessary
                if not library_id in cinfo.result_fh_dict:
                    cinfo.result_fh_dict[library_id] = open(
                        cinfo.result_file_dict[library_id], "w")
                    print >> cinfo.result_fh_dict[library_id], '\t'.join(
                        get_classification_header())
                # write to file
                print >> cinfo.result_fh_dict[library_id], '\t'.join(
                    map(str, fields))
        # close open file handles
        for fh in cinfo.result_fh_dict.itervalues():
            fh.close()
    for category_key, cinfo in category_info_dict.iteritems():
        classify_tasks = []
        for lib_counts in lib_counts_list:
            # see if can run classifier on this file
            if lib_counts.category_counts[category_key] > 0:
                filename = cinfo.result_file_dict[lib_counts.library_id]
                classify_tasks.append((lib_counts.library_id, filename))
        # run classification
        logging.info("Classifying transcripts category='%s'" %
                     (cinfo.category_str))
        classify_category(cinfo, classify_tasks, num_processors, tmp_dir)
        # sort results
        sort_classification_results(cinfo.ctree_file, cinfo.sorted_ctree_file,
                                    tmp_dir)
        os.remove(cinfo.ctree_file)
Example #18
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("gtf_file")
    args = parser.parse_args()    
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        # group transcripts by gene id
        gene_transcript_map = collections.defaultdict(lambda: [])
        for t in locus_transcripts:
            gene_transcript_map[t.attrs['gene_id']].append(t)
        # categorize genes
        for gene_id, gene_transcripts in gene_transcript_map.iteritems():
            gene_exons = cluster_isoforms(gene_transcripts)
            length = sum((e[1]-e[0]) for e in gene_exons)
            print '\t'.join([gene_id, str(length)])
def full_transcript_analysis(gtf_file, genome_fasta_file, pfam_dir, 
                             output_dir, num_processes):
    # output files    
    pfam_file = os.path.join(output_dir, 'full_length_pfam.txt')
    # open genome fasta file
    ref_fa = pysam.Fastafile(genome_fasta_file)
    # convert transcripts to amino acid sequences and write to fasta file
    logging.debug('Writing transcript amino acid sequences to FASTA file(s)')
    tmp_dir = os.path.join(output_dir, 'tmp')
    os.makedirs(tmp_dir)
    fasta_prefix = os.path.join(tmp_dir, 'full')
    fasta_files = []
    fasta_sizes = []
    for i in xrange(num_processes):
        fasta_files.append(open('%s%d.fasta' % (fasta_prefix, i), 'w'))
        fasta_sizes.append(0)
    num_finished = 1
    fasta_file_index = 0
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            # get amino acid sequences in all reading frames
            aa_seqs = translate_transcript(t, ref_fa)
            for frame, aa_seq in enumerate(aa_seqs):
                lines = to_fasta('%s|frame=%d' % (t.attrs['transcript_id'], frame), aa_seq)
                print >>fasta_files[fasta_file_index], lines
            fasta_sizes[fasta_file_index] += 1
            fasta_file_index = (fasta_file_index + 1) % num_processes
            if (num_finished % 10000) == 0:
                logging.debug('Processed %d transcripts' % (num_finished))
            num_finished += 1
    # get fasta files with lines written
    fasta_file_names = []
    for i in xrange(len(fasta_files)):
        fasta_files[i].close()
        if fasta_sizes[i] > 0:
            fasta_file_names.append(fasta_files[i].name)  
    # cleanup
    ref_fa.close()
    #
    # search FASTA file against Pfam
    #
    logging.debug('Scanning for Pfam domains')
    retcode = run_pfam(fasta_file_names, pfam_dir, pfam_file, tmp_dir)
    if retcode != 0:
        logging.error('Error running pfam_scan.pl')
        return retcode
    return 0
Example #20
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        # group transcripts by gene id
        gene_transcript_map = collections.defaultdict(lambda: [])
        for t in locus_transcripts:
            gene_transcript_map[t.attrs['gene_id']].append(t)
        # categorize genes
        for gene_id, gene_transcripts in gene_transcript_map.iteritems():
            gene_exons = cluster_isoforms(gene_transcripts)
            length = sum((e[1] - e[0]) for e in gene_exons)
            print '\t'.join([gene_id, str(length)])
def build_locus_trees(gtf_file):
    transcripts = []
    locus_cluster_trees = collections.defaultdict(lambda: ClusterTree(0,1))    
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts: 
            is_ref = bool(int(t.attrs[GTFAttr.REF]))
            if not is_ref:
                continue
            i = len(transcripts)
            transcripts.append(t)
            locus_cluster_trees[t.chrom].insert(t.start, t.end, i)
    # build interval trees of loci
    locus_trees = collections.defaultdict(lambda: IntervalTree())
    for chrom, cluster_tree in locus_cluster_trees.iteritems():
        for locus_start, locus_end, indexes in cluster_tree.getregions():
            # collect the locus transcripts once and insert the interval once
            locus_transcripts = [transcripts[i] for i in indexes]
            locus_trees[chrom].insert_interval(
                Interval(locus_start, locus_end, value=locus_transcripts))
    return locus_trees
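A hypothetical lookup against the trees built above, assuming the bx-python IntervalTree.find(start, end) query; it collects the reference transcripts attached to each overlapping locus interval.
def overlapping_ref_transcripts(locus_trees, chrom, start, end):
    # gather reference transcripts whose locus overlaps the query interval
    hits = []
    for interval in locus_trees[chrom].find(start, end):
        hits.extend(interval.value)
    return hits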
Example #22
def build_locus_trees(gtf_file):
    transcripts = []
    locus_cluster_trees = collections.defaultdict(lambda: ClusterTree(0,1))
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            is_ref = bool(int(t.attrs[GTFAttr.REF]))
            if not is_ref:
                continue
            i = len(transcripts)
            transcripts.append(t)
            locus_cluster_trees[t.chrom].insert(t.start, t.end, i)
    # build interval trees of loci
    locus_trees = collections.defaultdict(lambda: IntervalTree())
    for chrom, cluster_tree in locus_cluster_trees.iteritems():
        for locus_start, locus_end, indexes in cluster_tree.getregions():
            # collect the locus transcripts once and insert the interval once
            locus_transcripts = [transcripts[i] for i in indexes]
            locus_trees[chrom].insert_interval(
                Interval(locus_start, locus_end, value=locus_transcripts))
    return locus_trees
def classify_transcripts(classify_dir, num_processors, gtf_score_attr, 
                         tmp_dir):
    # setup input and output files
    lib_counts_file = os.path.join(classify_dir, LIB_COUNTS_FILE)
    lib_counts_list = list(LibCounts.from_file(lib_counts_file))
    library_ids = [x.library_id for x in lib_counts_list]
    category_info_dict = {}
    for category_key in CATEGORIES:
        category_str = category_int_to_str[category_key]
        cinfo = CategoryInfo.create(library_ids, category_key, 
                                    category_str, classify_dir)
        category_info_dict[category_key] = cinfo
        # write input files for classifier
        logging.info("Writing classification input files category='%s'" % (cinfo.category_str))
        for transcripts in parse_gtf(open(cinfo.output_gtf_file)):
            for t in transcripts:
                # set transcript score
                t.score = float(t.attrs.get(gtf_score_attr, 0.0))
                library_id = t.attrs[GTFAttr.LIBRARY_ID]
                fields = get_classification_fields(t)
                # lookup file handle and open new file if necessary
                if not library_id in cinfo.result_fh_dict:
                    cinfo.result_fh_dict[library_id] = open(cinfo.result_file_dict[library_id], "w")        
                    print >>cinfo.result_fh_dict[library_id], '\t'.join(get_classification_header())
                # write to file
                print >>cinfo.result_fh_dict[library_id], '\t'.join(map(str, fields))        
        # close open file handles
        for fh in cinfo.result_fh_dict.itervalues():
            fh.close()
    for category_key, cinfo in category_info_dict.iteritems():
        classify_tasks = []
        for lib_counts in lib_counts_list:
            # see if can run classifier on this file
            if lib_counts.category_counts[category_key] > 0:
                filename = cinfo.result_file_dict[lib_counts.library_id]
                classify_tasks.append((lib_counts.library_id, filename))
        # run classification
        logging.info("Classifying transcripts category='%s'" % (cinfo.category_str))
        classify_category(cinfo, classify_tasks, num_processors, tmp_dir)
        # sort results
        sort_classification_results(cinfo.ctree_file, cinfo.sorted_ctree_file, tmp_dir)
        os.remove(cinfo.ctree_file)
def main():
    logging.basicConfig(level=logging.DEBUG, format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("ref_gtf_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    logging.info("Reading reference GTF file")
    locus_trees = read_reference_gtf(args.ref_gtf_file)
    logging.info("Categorizing test GTF file")
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        # group transcripts by gene id
        gene_transcript_map = collections.defaultdict(lambda: [])
        for t in locus_transcripts:
            gene_transcript_map[t.attrs[GTFAttr.GENE_ID]].append(t)
        # categorize genes
        for gene_transcripts in gene_transcript_map.itervalues():
            categorize_gene_transcripts(gene_transcripts, locus_trees)
            # output transcript
            for t in gene_transcripts:
                for f in t.to_gtf_features():
                    print str(f)
Example #25
def main():
    # setup logging
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument('-g', dest='gene_id', action='store_true', default=False)
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    # check command line parameters
    if not os.path.exists(args.gtf_file):
        parser.error("gtf file %s not found" % (args.gtf_file))
    for transcripts in parse_gtf(open(args.gtf_file)):
        for t in transcripts:
            if args.gene_id:
                name = '%s|%s' % (t.attrs['gene_id'], t.attrs['transcript_id'])
            else:
                name = t.attrs['transcript_id']
            fields = write_bed(t.chrom, name, t.strand, 1000, t.exons)
            print '\t'.join(fields)
    return 0
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("gtf_file")
    parser.add_argument("region")
    args = parser.parse_args()
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    region_chrom, startend = args.region.split(":")
    start, end = startend.split("-")
    region_start = int(start)
    region_end = int(end)
    for transcripts in parse_gtf(open(args.gtf_file)):
        for t in transcripts:
            if ((t.chrom == region_chrom) and
                (t.start < region_end) and
                (t.end > region_start)):
                features = t.to_gtf_features()
                for f in features:
                    print str(f)
    logging.debug("Done")
    return 0
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("ref_gtf_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    logging.info("Reading reference GTF file")
    locus_trees = read_reference_gtf(args.ref_gtf_file)
    logging.info("Categorizing test GTF file")
    for locus_transcripts in parse_gtf(open(args.gtf_file)):
        # group transcripts by gene id
        gene_transcript_map = collections.defaultdict(lambda: [])
        for t in locus_transcripts:
            gene_transcript_map[t.attrs[GTFAttr.GENE_ID]].append(t)
        # categorize genes
        for gene_transcripts in gene_transcript_map.itervalues():
            categorize_gene_transcripts(gene_transcripts, locus_trees)
            # output transcript
            for t in gene_transcripts:
                for f in t.to_gtf_features():
                    print str(f)
Example #28
def read_gtf(filename):
    return list(parse_gtf(open(get_gtf_path(filename))))
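A brief usage sketch: read_gtf materializes the per-locus transcript lists that parse_gtf yields, so counting loci and transcripts is a two-liner; the file name is a placeholder.
# hypothetical usage: count loci and transcripts in a test GTF
loci = read_gtf('test1.gtf')
print len(loci), sum(len(locus) for locus in loci)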
Example #29
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    parser.add_argument('excl_file')
    parser.add_argument('chrom_sizes')
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    prefix = args.output_prefix
    excl_file = args.excl_file
    chrom_sizes_file = args.chrom_sizes
    gtf_file = args.gtf_file
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(chrom_sizes_file):
        parser.error('chrom sizes file %s not found' % (chrom_sizes_file))
    gene_intervals_file = prefix + '.gene_intervals.bed'
    gene_intervals_shuffled_file = prefix + '.gene_intervals.shuffle.bed'
    shuffled_gtf_file = prefix + '.shuffle.gtf'
    sorted_shuffled_gtf_file = prefix + '.shuffle.srt.gtf'
    logging.info('Parsing GTF file')
    with open(gene_intervals_file, 'w') as f:
        for locus_transcripts in parse_gtf(open(gtf_file)):
            # find borders of locus
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug(
                "[LOCUS] %s:%d-%d %d transcripts" %
                (locus_chrom, locus_start, locus_end, len(locus_transcripts)))
            for g in get_gene_intervals(locus_transcripts):
                print >> f, '\t'.join(
                    map(str, [g.chrom, g.start, g.end, g.gene_id]))
    # randomly shuffle genes
    logging.info("Shuffling genes")
    bedtools_args = [
        'bedtools', 'shuffle', '-excl', excl_file, '-i', gene_intervals_file,
        '-g', args.chrom_sizes
    ]
    with open(gene_intervals_shuffled_file, 'w') as fileh:
        subprocess.call(bedtools_args, stdout=fileh)
    # read new gene positions
    logging.info("Reading shuffled gene intervals")
    shuffle_gene_map = {}
    with open(gene_intervals_shuffled_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            gene_id = fields[3]
            shuffle_gene_map[gene_id] = (chrom, start, end)
    # reposition transcripts
    logging.info("Repositioning transcripts")
    with open(shuffled_gtf_file, 'w') as fileh:
        for locus_transcripts in parse_gtf(open(gtf_file)):
            # get original positions
            orig_gene_map = {}
            for g in get_gene_intervals(locus_transcripts):
                orig_gene_map[g.gene_id] = (g.chrom, g.start, g.end)
            for t in locus_transcripts:
                gene_id = t.attrs['gene_id']
                orig_chrom, orig_start, orig_end = orig_gene_map[gene_id]
                if gene_id not in shuffle_gene_map:
                    logging.warning(
                        'Gene %s [%s:%d-%d] could not be shuffled' %
                        (gene_id, orig_chrom, orig_start, orig_end))
                    continue
                new_chrom, new_start, new_end = shuffle_gene_map[gene_id]
                # reposition transcript
                t.chrom = new_chrom
                t.start = new_start + (t.start - orig_start)
                t.end = new_start + (t.end - orig_start)
                for e in t.exons:
                    e.start = new_start + (e.start - orig_start)
                    e.end = new_start + (e.end - orig_start)

                fields = write_bed(t.chrom, t.attrs['transcript_id'], t.strand,
                                   1000, t.exons)
                print '\t'.join(fields)
                # also write the repositioned transcript to the shuffled GTF so
                # the sort step below has features to work with
                for f in t.to_gtf_features(source='shuffle'):
                    print >>fileh, str(f)
    logging.info("Sorting GTF file")
    sort_gtf(shuffled_gtf_file, sorted_shuffled_gtf_file)
def main():
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--rename', dest='rename', action='store_true')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    rename = args.rename
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    # parse transcripts
    num_transcripts = 0
    # keep track of redundant gene/transcript counts
    gene_map = collections.defaultdict(lambda: {})
    transcript_map = collections.defaultdict(lambda: {})
    for transcripts in parse_gtf(open(gtf_file)):
        for t in transcripts:
            catstr = t.attrs['category']
            catint = Category.to_int(catstr)
            gene_type = t.attrs.get('gene_type', None)
            ref_gene_type = t.attrs['ref_gene_type']
            if catint == Category.SAME_STRAND:
                # impute gene type
                new_gene_type = ref_gene_type
            else:
                if gene_type == 'protein_coding':
                    # don't change protein coding genes
                    new_gene_type = gene_type
                elif t.length < 250:
                    # categorize small RNA separately
                    new_gene_type = 'misc_RNA'
                else:
                    if ref_gene_type == 'protein_coding':
                        # categorize based on overlap with reference
                        new_gene_type = PROTEIN_CATEGORY_MAP[catint]
                    else:
                        # reference is also non-coding
                        new_gene_type = 'lincRNA'
            # get gene category
            gene_category = GENCODE_CATEGORY_MAP[new_gene_type]
            new_gene_name = None
            if rename:
                # resolve upper/lower case issue with gene names from
                # different databases
                ref_gene_name = t.attrs['ref_gene_name'].upper()
                # build new gene name
                if ref_gene_name == 'NONE':
                    new_gene_name = str(t.attrs['source'])
                elif catint == Category.SAME_STRAND:
                    new_gene_name = str(ref_gene_name)
                else:
                    new_gene_name = '%s.%s' % (ref_gene_name, catstr)
                # gene name string is key to a dictionary that
                # associates each gene id with an integer number
                gene_id = t.attrs['gene_id']
                gene_dict = gene_map[new_gene_name]
                if gene_id not in gene_dict:
                    gene_num = len(gene_dict) + 1
                    gene_dict[gene_id] = gene_num
                else:
                    gene_num = gene_dict[gene_id]
                # gene id is also key to dict that associates each isoform
                # of gene with integer number
                t_id = t.attrs['transcript_id']
                t_dict = transcript_map[gene_id]
                if t_id not in t_dict:
                    t_num = len(t_dict) + 1
                    t_dict[t_id] = t_num
                else:
                    t_num = t_dict[t_id]
                # append gene/transcript integers to gene name
                new_gene_name = '%s.%d.%d' % (new_gene_name, gene_num, t_num)
            # write new attributes
            for f in t.to_gtf_features(source='assemblyline', score=1000):
                f.attrs['gene_type'] = new_gene_type
                f.attrs['gene_category'] = gene_category
                if rename:
                    if 'gene_name' in f.attrs:
                        f.attrs['orig_gene_name'] = f.attrs['gene_name']
                    f.attrs['gene_name'] = new_gene_name
                print str(f)
            num_transcripts += 1
    return 0
Example #31
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--length", type=int, dest="length", default=250)
    parser.add_argument("--ncrna-exons", type=int, dest="ncrna_exons", default=1)
    parser.add_argument("--pseudogene-exons", type=int, dest="pseudogene_exons", default=1)
    parser.add_argument("--antisense-exons", type=int, dest="antisense_exons", default=2)
    parser.add_argument("--intronic-exons", type=int, dest="intronic_exons", default=2)
    parser.add_argument("--intergenic-exons", type=int, dest="intergenic_exons", default=2)
    parser.add_argument("--intergenic-dist", type=int, dest="intergenic_dist", default=1000)
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    lncrna_categories = set(("intronic", "intergenic", "antisense", "ncrna"))
    logging.debug("Retrieving lncrna features from GTF")
    total_transcripts = 0
    lncrnas = 0
    intergenic = 0
    intronic = 0
    antisense = 0
    pseudogene = 0
    ncrna = 0
    for transcripts in parse_gtf(open(args.gtf_file)):
        total_transcripts += len(transcripts)
        for t in transcripts:
            # throw out protein coding genes
            category = t.attrs["category"]
            if category not in lncrna_categories:
                continue
            # throw out transcripts that overlap certain classes of 
            # transcripts
            annotation_sources = set(t.attrs["annotation_sources"].split(','))
            if not annotation_sources.isdisjoint(IGNORE_SOURCES):
                continue
            # check length requirement
            if t.length < args.length:
                continue            
            if category == "intergenic":
                dist = int(t.attrs["nearest_dist"])
                # exclude "intergenic" lncrnas that are close to known genes
                if dist < args.intergenic_dist:
                    continue
                if len(t.exons) < args.intergenic_exons:
                    continue
                intergenic += 1
            elif category == "intronic":
                if len(t.exons) < args.intronic_exons:
                    continue
                intronic += 1
            elif category == "antisense":
                if len(t.exons) < args.antisense_exons:
                    continue
                antisense += 1
            else:
                # keep multi-exonic pseudogenes
                if not annotation_sources.isdisjoint(PSEUDOGENE_SOURCES):
                    if len(t.exons) < args.pseudogene_exons:
                        continue
                    pseudogene += 1
                else:
                    if len(t.exons) < args.ncrna_exons:
                        continue
                    ncrna += 1
            # output
            for f in t.to_gtf_features():
                print str(f)
            lncrnas += 1
    logging.debug("Read %d lncrna transcripts out of %d total transcripts" % (lncrnas, total_transcripts))
    logging.debug("intergenic: %d" % (intergenic))
    logging.debug("intronic: %d" % (intronic))
    logging.debug("antisense: %d" % (antisense))
    logging.debug("pseudogene: %d" % (pseudogene))
    logging.debug("ncrna: %d" % (ncrna))
def orf_analysis(gtf_file, genome_fasta_file, pfam_dir, 
                 output_dir, min_orf_length, first_orf_only, 
                 num_processes):
    #
    # extract transcript DNA sequences, translate to protein, and
    # search for ORFs
    #
    logging.debug('Finding ORFs in transcript sequences')
    # output files
    tmp_dir = os.path.join(output_dir, 'tmp')
    if not os.path.exists(tmp_dir):
        logging.info("Creating tmp directory '%s'" % (tmp_dir))
        os.makedirs(tmp_dir)
    orf_bed_file = os.path.join(output_dir, 'transcript_orfs.bed')
    unique_orf_file = os.path.join(output_dir, 'unique_orfs.txt')
    unique_orf_bed_file = os.path.join(output_dir, 'unique_orfs.bed')
    orf_file = os.path.join(tmp_dir, 'transcript_orfs.no_ids.txt')
    sorted_orf_file = os.path.join(tmp_dir, 'transcript_orfs.no_ids.sortbyorf.txt')
    sorted_orf_id_file = os.path.join(tmp_dir, 'transcript_orfs.sortbyorf.txt')
    signalp_file = os.path.join(output_dir, 'signalp.txt')
    pfam_file = os.path.join(output_dir, 'pfam.txt')
    merged_orf_id_file = os.path.join(output_dir, 'transcript_orfs.sortbyorf.merged.txt')
    sorted_merged_orf_id_file = os.path.join(output_dir, 'transcript_orfs.sortbytranscript.merged.txt')
    # open output files
    orf_fileh = open(orf_file, 'w')
    orf_bed_fileh = open(orf_bed_file, 'w')
    # open genome fasta file
    ref_fa = pysam.Fastafile(genome_fasta_file)
    num_finished = 1
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            if first_orf_only:
                # get first ORF
                orf = get_first_transcript_orf(t, ref_fa)
                if len(orf.seq) >= min_orf_length:
                    print >>orf_fileh, '\t'.join(orf.to_table())
                    print >>orf_bed_fileh, '\t'.join(orf.to_bed())
            else:
                # get all ORFs
                for orf in get_all_transcript_orfs(t, ref_fa, min_orf_length):
                    print >>orf_fileh, '\t'.join(orf.to_table())
                    print >>orf_bed_fileh, '\t'.join(orf.to_bed())
            if (num_finished % 10000) == 0:
                logging.debug('Processed %d transcripts' % (num_finished))
            num_finished += 1
    # cleanup
    orf_fileh.close()
    orf_bed_fileh.close()
    #
    # sort ORF table by ORF amino acid sequence to group identical ORFs 
    # together
    #
    logging.debug('Sorting ORFs by amino acid sequence')
    def sort_by_seq(line):
        '''comparison function for batch_sort'''
        fields = line.strip().split('\t')
        return fields[ORFInfo.SEQ_COL_NUM]
    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=orf_file,
               output=sorted_orf_file,
               key=sort_by_seq,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[sort_tmp_dir])
    shutil.rmtree(sort_tmp_dir)
    #
    # assign each ORF a unique id and write to FASTA file
    #
    logging.debug('Determining unique ORFs')
    orf_fasta_prefix = os.path.join(tmp_dir, 'orf')
    orf_fasta_files = []
    orf_fasta_sizes = []
    for i in xrange(num_processes):
        orf_fasta_files.append(open('%s%d.fasta' % (orf_fasta_prefix, i), 'w'))
        orf_fasta_sizes.append(0)
    orf_file_index = 0
    outfileh = open(sorted_orf_id_file, 'w')
    unique_orf_fileh = open(unique_orf_file, 'w')
    print >>unique_orf_fileh, '\t'.join(['orf_id', 'orf_length', 'total_occurrences', 'unique_genomic_occurrences'])
    unique_orf_bed_fileh = open(unique_orf_bed_file, 'w')
    with open(sorted_orf_file) as infileh:
        for orfs in group_unique_orfs(infileh):
            # write to master transcript/ORF table
            for orf in orfs:
                print >>outfileh, '\t'.join(orf.to_table())
            # write ORF to fasta file
            lines = to_fasta(orfs[0].orf_id, orfs[0].seq.strip('*'))
            print >>orf_fasta_files[orf_file_index], lines
            orf_fasta_sizes[orf_file_index] += 1
            # advance to next fasta file
            orf_file_index = (orf_file_index + 1) % (num_processes)
            # group by genomic position and write ORFs to BED file
            unique_genome_orfs = {}
            for orf in orfs:
                k = (orf.chrom, orf.strand, tuple(orf.exons))
                if k in unique_genome_orfs:
                    continue
                unique_genome_orfs[k] = orf
            for orf in unique_genome_orfs.itervalues():
                print >>unique_orf_bed_fileh, '\t'.join(orf.to_bed(orf.orf_id))
            # write unique ORF to tab-delimited text file
            fields = [orfs[0].orf_id, len(orfs[0].seq), len(orfs), len(unique_genome_orfs)]
            print >>unique_orf_fileh, '\t'.join(map(str,fields))
    # cleanup
    unique_orf_bed_fileh.close()
    outfileh.close()
    # get fasta files with lines written
    orf_fasta_file_names = []
    for i in xrange(len(orf_fasta_files)):
        orf_fasta_files[i].close()
        if orf_fasta_sizes[i] > 0:
            orf_fasta_file_names.append(orf_fasta_files[i].name)
    #
    # search FASTA file against signalp
    #
    logging.debug('Searching for signal peptides')
    retcode = run_signalp(orf_fasta_file_names, signalp_file, tmp_dir)
    if retcode != 0:
        logging.error('Error searching for signal peptides')
        return 1 
    #
    # search FASTA file against Pfam
    #
    logging.debug('Scanning for Pfam domains')
    retcode = run_pfam(orf_fasta_file_names, pfam_dir, pfam_file, tmp_dir)
    if retcode != 0:
        logging.error('Error running pfam_scan.pl')
    #
    # merge results from Pfam and signalp
    #
    logging.debug('Merging SignalP and Pfam results')
    merge_results(sorted_orf_id_file, signalp_file, pfam_file, merged_orf_id_file)
    #
    # sort by transcript id
    #
    logging.debug('Sorting ORFs by transcript ID')
    def sort_by_transcript_id(line):
        return line.split('\t', 1)[0]
    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=merged_orf_id_file,
               output=sorted_merged_orf_id_file,
               key=sort_by_transcript_id,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[sort_tmp_dir])
    shutil.rmtree(sort_tmp_dir)
    # cleanup
    ref_fa.close()
    return 0
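A sketch of how orf_analysis might be invoked; every path and parameter value below is a placeholder assumption rather than a documented default.
def run_orf_analysis_example():
    # hypothetical invocation of the ORF analysis pipeline defined above
    return orf_analysis(gtf_file='assembly.gtf',
                        genome_fasta_file='genome.fa',
                        pfam_dir='/path/to/pfam',
                        output_dir='orf_out',
                        min_orf_length=30,
                        first_orf_only=False,
                        num_processes=4)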
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--assembly", dest = 'assembly_bed',
                    default = intergenic_assembly_bed,
                    help = 'Assembly file used for shuffling and snp overlap intersection')
    parser.add_argument("--snps", dest = 'snps',
                    default = snp_bed,
                    help = 'SNP universe bed file')
    parser.add_argument("--excl", dest = 'excl',
                    default = excl_file,
                    help = 'Exclusion file used for shuffling')
    parser.add_argument("--chrom", dest = 'chrom',
                    default = chrom_sizes_file,
                    help = 'Chrom size file used for shuffling')
    parser.add_argument("--gtf", dest = 'gtf',
                    default = gtf_file,
                    help = 'GTF file used to generate shuffle (should match assembly_bed)')
    parser.add_argument("--gwas", dest = 'gwas',
                    default = gwas_bed,
                    help = 'GWAS bed file file used for intersection')
    parser.add_argument("--flank", dest = 'flank',
                    default = 0,
                    help = 'number of flanking bases to add to bed files')
    args = parser.parse_args()
    
    args.flank = int(args.flank)
    
    logging.info('Output is printed to stdout, to save use \'>\' <filename>')
    
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(args.chrom):
        parser.error('chrom sizes file %s not found' % (args.chrom))

    if not os.path.isdir('GWAS_TMPS'):
        os.mkdir('GWAS_TMPS')
    
    prefix = 'GWAS_TMPS'    
    gene_intervals_file = os.path.join(prefix, 'gene_intervals.bed')
    intersect_file = os.path.join(prefix, 'intersect.txt')
    assembly_flank = os.path.join(prefix, 'flank.bed')
    
    output_file = 'gwas_intergenic_null.txt'
    logging.info('Parsing GTF file')
    with open(gene_intervals_file, 'w') as f:
        for locus_transcripts in parse_gtf(open(args.gtf)):
            # find borders of locus
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            for g in get_gene_intervals(locus_transcripts):
                print >>f, '\t'.join(map(str, [g.chrom, g.start, g.end, g.gene_id]))   
                
    #apply flank to the bed file 
    #read chrom file to make sure flanks added do not enter chrom ends
    chrom_length = {}
    for line in open(args.chrom):
        line = line.strip().split('\t')
        chr = line[0]
        # store lengths as ints so the flank clipping with min() below works correctly
        length = int(line[1])
        chrom_length[chr] = length
    with open(assembly_flank, 'w') as f:
        for line in open(args.assembly_bed):
            line = line.strip().split('\t')
            chr = line[0]
            start = int(line[1])
            end = int(line[2])
            chr_len = chrom_length[chr]
            start = max(0, (start - args.flank))
            end = min(chr_len, (end + args.flank))
            line[1] = start
            line[2] = end
            print >> f, '\t'.join(map(str, line))
    
    
    #GWAS snps
    #do intersections for real data and report number of overlapping GWAS snps 
    logging.info('Intersecting assembly with GWAS snps')
    args_int = ['bedtools', 'intersect', 
            '-a', args.gwas,
            '-b', assembly_flank,
            '-wa',
            '-wb']
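    # -wa/-wb report the original SNP (A) and assembly interval (B) entries for each overlap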
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    #count number of SNPs caught
    snps = set()
    
    for line in open(intersect_file):
        line = line.strip().split('\t')
        rsID = line[RSIDCOL]
        snps.add(rsID)
    gwas_overlap = len(snps)

    #snp universe
    #do intersections for real data and report number of overlapping snps in snp universe 
    logging.info('Intersecting assembly with snp universe')
    args_int = ['bedtools', 'intersect', 
            '-a', args.snps,
            '-b', assembly_flank,
            '-wa',
            '-wb',
            '-sorted']
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    #count number of SNPs caught
    snps = set()
    for line in open(intersect_file):
        line = line.strip().split('\t')
        rsID = line[RSIDCOL]
        snps.add(rsID)
    snp_overlap = len(snps)
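    # frac_real: GWAS SNP hits as a fraction of all SNP-universe hits within the (flanked) assembly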
    frac_real = float(gwas_overlap)/snp_overlap
    logging.info('%d GWAS snps overlap compendia genes'  % gwas_overlap)
    logging.info('%d snps (from \"snp universe\") overlap compendia genes' % snp_overlap)
    logging.info('Frac: %f' % frac_real)
    
    
    print '\t'.join(map(str, [args.flank, gwas_overlap, snp_overlap, frac_real]))
    

    
    shutil.rmtree(prefix)
    
    return 0
def main():
    # parse command line
    parser = argparse.ArgumentParser()
    parser.add_argument("-v",
                        "--verbose",
                        action="store_true",
                        dest="verbose",
                        default=False)
    parser.add_argument("ref_gtf_file")
    parser.add_argument("gtf_file")
    args = parser.parse_args()
    # set logging level
    if args.verbose:
        level = logging.DEBUG
    else:
        level = logging.INFO
    logging.basicConfig(
        level=level,
        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    # check command line parameters
    if not os.path.exists(args.ref_gtf_file):
        parser.error("GTF file %s not found" % (args.ref_gtf_file))
    if not os.path.exists(args.gtf_file):
        parser.error("GTF file %s not found" % (args.gtf_file))
    logging.info("AssemblyLine %s" % (assemblyline.__version__))
    logging.info("----------------------------------")
    # show parameters
    logging.info("Parameters:")
    logging.info("verbose logging:       %s" % (args.verbose))
    logging.info("ref gtf file:          %s" % (args.ref_gtf_file))
    logging.info("assembly gtf file:     %s" % (args.gtf_file))
    # find CDS regions
    if not os.path.exists('tmp.srt.gtf'):
        with open('tmp.gtf', 'w') as outfileh:
            logging.info("Reading CDS regions from reference GTF")
            for f in get_cds_features(args.ref_gtf_file):
                print >> outfileh, str(f)
            logging.info("Reading transcripts from assembly GTF")
            i = 0
            for f in GTFFeature.parse(open(args.gtf_file)):
                print >> outfileh, str(f)
                i += 1
                if i % 100000 == 0:
                    logging.debug("Parsed %d transcripts" % (i))
        logging.info("Sorting GTF file")
        sort_gtf('tmp.gtf', 'tmp.srt.gtf')
    for locus_transcripts in parse_gtf(open('tmp.srt.gtf')):
        locus_chrom = locus_transcripts[0].chrom
        locus_start = locus_transcripts[0].start
        locus_end = max(t.end for t in locus_transcripts)
        logging.debug(
            "[LOCUS] %s:%d-%d %d transcripts" %
            (locus_chrom, locus_start, locus_end, len(locus_transcripts)))
        for start, end, strand, m, t, c in categorize(locus_transcripts):
            fields = [
                locus_chrom,
                str(start),
                str(end),
                '%s|%s|%s' % (m, t, c), '0',
                strand_int_to_str(strand)
            ]
            print '\t'.join(fields)
    return 0
Example #35
0
def read_gtf(filename):
    return list(parse_gtf(open(get_gtf_path(filename))))
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--rand_snp", dest = 'rand_snp',
                    default = rand_snps_file,
                    help = 'Bed file of random snps to use as negative control for analyses')
    parser.add_argument("--snps", dest = 'snps',
                    default = snp_bed,
                    help = 'SNP universe bed file')
    parser.add_argument("--excl", dest = 'excl',
                    default = excl_file,
                    help = 'Exclusion file used for shuffling')
    parser.add_argument("--chrom", dest = 'chrom',
                    default = chrom_sizes_file,
                    help = 'Chrom size file used for shuffling')
    parser.add_argument("--gtf", dest = 'gtf',
                    help = 'GTF file used to generate shuffle')
    parser.add_argument("--gwas", dest = 'gwas',
                    default = gwas_bed_file,
                    help = 'GWAS bed file file used for intersection')
    parser.add_argument("--shuffs", dest = 'shuffs',
                    default = 100,
                    help = 'number of shuffles to perform')
    parser.add_argument("-p", dest = 'proc',
                    default = 4,
                    help = 'number of processors to use')
    parser.add_argument("--flank", dest = 'flank',
                    default = 0,
                    help = 'number of flanking bases to add to bed files')
    parser.add_argument("--exon", dest="exon", 
                        action="store_true", default=False, 
                        help="Perform analysis looking only at exonic overlap")
    args = parser.parse_args()
    
    args.proc = int(args.proc)
    args.flank = int(args.flank)
    
    logging.info('Output is printed to stdout')
    if args.exon: 
        logging.info('Looking at exonic overlap only')
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(args.chrom):
        parser.error('chrom sizes file %s not found' % (args.chrom))

    if not os.path.isdir('GWAS_TMPS'):
        os.mkdir('GWAS_TMPS')
    
    prefix = 'GWAS_TMPS'    
    locus_intervals_file = os.path.join(prefix, 'locus_intervals.bed')
    intersect_file = os.path.join(prefix, 'intersect.txt')
    assembly_flank = os.path.join(prefix, 'flank.bed')
    assembly_flank_sorted = os.path.join(prefix, 'flank.sorted.bed')
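    # coordinate-sorted copy of the flanked BED, required for 'bedtools intersect -sorted'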
    assembly_bed = os.path.join(prefix, 'assembly.bed')
    
    #read chrom file to make sure flanks added do not enter chrom ends
    chrom_length = {}
    for line in open(args.chrom):
        line = line.strip().split('\t')
        chr = line[0]
        # store lengths as ints so coordinate clipping against chromosome ends works correctly
        length = int(line[1])
        chrom_length[chr] = length
    
    
    #convert GTF file to BED for initial intersections and get locus intervals 
    logging.info('Parsing GTF: converting to BED and obtaining locus intervals')
    with open(assembly_bed, 'w') as f2:
        with open(locus_intervals_file, 'w') as f:
            j = 0
            for locus_transcripts in parse_gtf(open(args.gtf)):
                if (j % 2500) == 0:
                    logging.debug('Finished %d loci' % (j))
                for t in locus_transcripts:
                    name = t.attrs['transcript_id']
                    fields = write_bed(t.chrom, name, t.strand, 1000, t.exons, args.flank, chrom_length)
                    print >>f2, '\t'.join(fields)
                # find borders of locus
                locus_chrom = locus_transcripts[0].chrom
                locus_start = min(t.start for t in locus_transcripts)
                locus_end = max(t.end for t in locus_transcripts)
                locus_id = j
                j+=1
                print >>f, '\t'.join(map(str, [locus_chrom, locus_start, locus_end, locus_id]))
    #apply flank to the bed file 
    with open(assembly_flank, 'w') as f:
        for line in open(assembly_bed):
            line = line.strip().split('\t')
            chr = line[0]
            start = int(line[1])
            end = int(line[2])
            chr_len = chrom_length[chr]
            start = max(0, (start - args.flank))
            end = min(chr_len, (end + args.flank))
            line[1] = start
            line[2] = end
            print >> f, '\t'.join(map(str, line))
    
    
    args_sort = ['sort', 
                 '-k1,1',
                 '-k2,2n', 
                 assembly_flank]
    with open(assembly_flank_sorted, 'w') as fileh:
        subprocess.call(args_sort, stdout=fileh)
    
    
    #GWAS snps
    #do intersections for real data and report number of overlapping GWAS snps 
    logging.info('Intersecting assembly with GWAS snps')
    args_int = ['bedtools', 'intersect', 
            '-a', args.gwas,
            '-b', assembly_flank_sorted,
            '-sorted',
            '-wa',
            '-wb']
    if args.exon:
        args_int.append('-split')
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    #count number of SNPs caught
    snps = set()
    for line in open(intersect_file):
        line = line.strip().split('\t')
        rsID = line[RSIDCOL]
        snps.add(rsID)
    gwas_overlap = len(snps)

    #Random snps
    #do intersections for real data and report number of overlapping GWAS snps 
    logging.info('Intersecting assembly with random snps')
    args_int = ['bedtools', 'intersect', 
            '-a', args.rand_snp,
            '-b', assembly_flank_sorted,
            '-sorted',
            '-wa',
            '-wb']
    if args.exon:
        args_int.append('-split')
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    #count number of SNPs caught
    snps = set()
    for line in open(intersect_file):
        line = line.strip().split('\t')
        rsID = line[RSIDCOL]
        snps.add(rsID)
    rand_overlap = len(snps)

    #snp universe
    #do intersections for real data and report number of overlapping snps in snp universe 
    logging.info('Intersecting assembly with snp universe')
    args_int = ['bedtools', 'intersect', 
            '-a', args.snps,
            '-b', assembly_flank_sorted,
            '-sorted',
            '-wa',
            '-wb']
    if args.exon:
        args_int.append('-split')
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    #count number of SNPs caught
    snps = set()
    for line in open(intersect_file):
        line = line.strip().split('\t')
        rsID = line[RSIDCOL]
        snps.add(rsID)
    snp_overlap = len(snps)
    frac_real = float(gwas_overlap)/snp_overlap
    frac_rand = float(rand_overlap)/snp_overlap
    if args.exon: 
        logging.info('%d GWAS snps overlap compendia exons'  % gwas_overlap)
        logging.info('%d random snps overlap compendia exons'  % rand_overlap)
        logging.info('%d total snps overlap compendia exons' % snp_overlap)
    else: 
        logging.info('%d GWAS snps overlap compendia genes'  % gwas_overlap)
        logging.info('%d random snps overlap compendia genes'  % rand_overlap)
        logging.info('%d total snps overlap compendia genes' % snp_overlap)    
    logging.info('Frac_gwas: %f' % frac_real)
    logging.info('Frac_rand: %f' % frac_rand)
    
    
    
    #loop the shuffle to generate a distribution of nulls for number of snps hit by random intergenic genes
    pool = multiprocessing.Pool(args.proc)
    NUM_SHUFFS = int(args.shuffs)
    shuff_args = (args.snps,
                  args.gwas,
                  args.rand_snp,
                  args.excl,
                  locus_intervals_file,
                  args.chrom,
                  args.gtf,
                  frac_real,
                  frac_rand,
                  gwas_overlap,
                  rand_overlap,
                  snp_overlap,
                  NUM_SHUFFS,
                  prefix,
                  args.flank,
                  args.exon)
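    # each task tuple is (shuffle index,) + shuff_args and is consumed by shuffle_imap below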
    tasks = []
    header = [
              'gwas_shuff_overlap',
              'rand_snp_shuff_overlap',
              'all_snp_shuff_overlap',
              'frac_gwas_shuff',
              'frac_rand_shuff',
              'gwas_overlap',
              'rand_snp_overlap',
              'all_snp_overlap',
              'frac_gwas',
              'frac_rand',
              'OR_gwas',
              'OR_rand'
              ]
    
    print '\t'.join(header)
    logging.info("Beginning shuffles")
    for i in xrange(NUM_SHUFFS):
        tasks.append((i,) + shuff_args)
    result_iter = pool.imap_unordered(shuffle_imap, tasks)
    for line in result_iter:
        print line
    pool.close()
    pool.join()
    
    shutil.rmtree(prefix)
    
    return 0
def shuffle(process,
            snps_file,
            gwas_file,
            excl_file, 
            locus_intervals_file, 
            chrom_sizes_file,
            gtf_file,
            frac_real,
            gwas_real,
            snps_real,
            NUM_SHUFFS,
            output_dir,
            flank):
    x = process
    prefix = 'process' + str(x)
    locus_intervals_shuffled_file = os.path.join(output_dir, prefix + '.locus_intervals.shuffle.bed')
    shuffled_bed_file = os.path.join(output_dir, prefix + '.shuffle.bed')
    intersect_file = os.path.join(output_dir, prefix + '.intersect.txt')
    args_shuff = ['bedtools', 'shuffle', 
            '-excl', excl_file,
            '-i', locus_intervals_file, 
            '-g', chrom_sizes_file]
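    # bedtools shuffle randomly relocates each locus interval, avoiding regions in the exclusion file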
    with open(locus_intervals_shuffled_file, 'w') as fileh:
        subprocess.call(args_shuff, stdout=fileh)
    # read new gene positions
    shuffle_locus_map = {}
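    # maps locus_id -> (chrom, start, end) at the shuffled location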
    with open(locus_intervals_shuffled_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            locus_id = int(fields[3])
            shuffle_locus_map[locus_id] = (chrom, start, end)
    # reposition transcripts
    
    #read chrom file to make sure flanks added do not enter chrom ends
    chrom_length = {}
    for line in open(chrom_sizes_file):
        line = line.strip().split('\t')
        chr = line[0]
        # store lengths as ints so downstream coordinate clipping works correctly
        length = int(line[1])
        chrom_length[chr] = length
    
    with open(shuffled_bed_file, 'w') as fileh:
        i=0
        for locus_transcripts in parse_gtf(open(gtf_file)):
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            locus_id = i
            i+=1
            orig_chrom, orig_start, orig_end = locus_chrom, locus_start, locus_end
            if locus_id not in shuffle_locus_map:
                logging.warning('Locus %s [%s:%d-%d] could not be shuffled' % (locus_id, orig_chrom, orig_start, orig_end))
                continue
            
            for t in locus_transcripts:
                new_chrom, new_start, new_end = shuffle_locus_map[locus_id]
                # reposition transcript
                t.chrom = new_chrom
                t.start = new_start + (t.start - orig_start)
                t.end = new_start + (t.end - orig_start)
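                # shift exon coordinates by the same offset so transcript structure is preserved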
                for e in t.exons:
                    e.start = new_start + (e.start - orig_start)
                    e.end = new_start + (e.end - orig_start)
                fields = write_bed(t.chrom, t.attrs['transcript_id'], t.strand, 1000, t.exons, flank, chrom_length)
                print >>fileh, '\t'.join(map(str,fields))


    #gwas snps
    #do intersection for shuffle with GWAS snps
    args_int = ['bedtools', 'intersect', 
            '-a', gwas_file,
            '-b', shuffled_bed_file,
            '-wa',
            '-wb']
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    #count number of GWAS SNPs caught
    snps = set()
    for line in open(intersect_file):
        line = line.strip().split('\t')
        rsID = line[RSIDCOL]
        snps.add(rsID)
    val = len(snps)
    
    #snp universe
    #do intersections for shuffle and snp universe 
    args_int = ['bedtools', 'intersect', 
            '-a', snps_file,
            '-b', shuffled_bed_file,
            '-wa',
            '-wb']
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    #count number of SNPs caught
    snps = set()
    for line in open(intersect_file):
        line = line.strip().split('\t')
        rsID = line[RSIDCOL]
        snps.add(rsID)
    snp_overlap = len(snps)
    frac = float(val)/snp_overlap
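    # OR: enrichment of the real GWAS fraction relative to this shuffle's fraction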
    OR = frac_real/frac
    logging.info('Shuffle %d/%d. GWAS: %d, Universe: %d, Fraction: %f, OR: %f' % (x + 1, NUM_SHUFFS, val, snp_overlap, frac, OR))
    return '\t'.join(map(str, [val, snp_overlap, frac, gwas_real, snps_real, frac_real, OR]))
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument("--assembly", dest = 'assembly_bed',
                    default = intergenic_assembly_bed,
                    help = 'Assembly file used for shuffling and snp overlap intersection')
    parser.add_argument("--snps", dest = 'snps',
                    default = snp_bed,
                    help = 'SNP universe bed file')
    parser.add_argument("--excl", dest = 'excl',
                    default = excl_file,
                    help = 'Exclusion file used for shuffling')
    parser.add_argument("--chrom", dest = 'chrom',
                    default = chrom_sizes_file,
                    help = 'Chrom size file used for shuffling')
    parser.add_argument("--gtf", dest = 'gtf',
                    default = gtf_file,
                    help = 'GTF file used to generate shuffle (should match assembly_bed)')
    parser.add_argument("--gwas", dest = 'gwas',
                    default = gwas_bed,
                    help = 'GWAS bed file file used for intersection')
    parser.add_argument("--shuffs", dest = 'shuffs',
                    default = 100,
                    help = 'number of shuffles to perform')
    parser.add_argument("-p", dest = 'proc',
                    default = 4,
                    help = 'number of processors to use')
    parser.add_argument("--flank", dest = 'flank',
                    default = 0,
                    help = 'number of flanking bases to add to bed files')
    args = parser.parse_args()
    
    args.proc = int(args.proc)
    args.flank = int(args.flank)
    
    logging.info("Output is printed to stdout; redirect with '> filename' to save it")
    
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(args.chrom):
        parser.error('chrom sizes file %s not found' % (args.chrom))

    if not os.path.isdir('GWAS_TMPS'):
        os.mkdir('GWAS_TMPS')
    
    prefix = 'GWAS_TMPS'    
    locus_intervals_file = os.path.join(prefix, 'locus_intervals.bed')
    intersect_file = os.path.join(prefix, 'intersect.txt')
    assembly_flank = os.path.join(prefix, 'flank.bed')
    
        
    logging.info('Parsing GTF file')
    with open(locus_intervals_file, 'w') as f:
        j = 0
        for locus_transcripts in parse_gtf(open(args.gtf)):
            # find borders of locus
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            locus_id = j
            j+=1
            print >>f, '\t'.join(map(str, [locus_chrom, locus_start, locus_end, locus_id]))   
    #apply flank to the bed file 
    #read chrom file to make sure flanks added do not enter chrom ends
    chrom_length = {}
    for line in open(args.chrom):
        line = line.strip().split('\t')
        chr = line[0]
        # store lengths as ints so the flank clipping with min() below works correctly
        length = int(line[1])
        chrom_length[chr] = length
    with open(assembly_flank, 'w') as f:
        for line in open(args.assembly_bed):
            line = line.strip().split('\t')
            chr = line[0]
            start = int(line[1])
            end = int(line[2])
            chr_len = chrom_length[chr]
            start = max(0, (start - args.flank))
            end = min(chr_len, (end + args.flank))
            line[1] = start
            line[2] = end
            print >> f, '\t'.join(map(str, line))
    
    
    #GWAS snps
    #do intersections for real data and report number of overlapping GWAS snps 
    logging.info('Intersecting assembly with GWAS snps')
    args_int = ['bedtools', 'intersect', 
            '-a', args.gwas,
            '-b', assembly_flank,
            '-wa',
            '-wb']
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    #count number of SNPs caught
    snps = set()
    
    for line in open(intersect_file):
        line = line.strip().split('\t')
        rsID = line[RSIDCOL]
        snps.add(rsID)
    gwas_overlap = len(snps)

    #snp universe
    #do intersections for real data and report number of overlapping snps in snp universe 
    logging.info('Intersecting assembly with snp universe')
    args_int = ['bedtools', 'intersect', 
            '-a', args.snps,
            '-b', assembly_flank,
            '-wa',
            '-wb',
            '-sorted']
    with open(intersect_file, 'w') as fileh:
        subprocess.call(args_int, stdout=fileh)
    #count number of SNPs caught
    snps = set()
    for line in open(intersect_file):
        line = line.strip().split('\t')
        rsID = line[RSIDCOL]
        snps.add(rsID)
    snp_overlap = len(snps)
    frac_real = float(gwas_overlap)/snp_overlap
    logging.info('%d GWAS snps overlap compendia genes'  % gwas_overlap)
    logging.info('%d snps (from \"snp universe\") overlap compendia genes' % snp_overlap)
    logging.info('Frac: %f' % frac_real)
    
    
    #loop the shuffle to generate a distribution of nulls for number of snps hit by random intergenic genes
    pool = multiprocessing.Pool(args.proc)
    NUM_SHUFFS = int(args.shuffs)
    shuff_args = (args.snps,
                  args.gwas,
                  args.excl,
                  locus_intervals_file,
                  args.chrom,
                  args.gtf,
                  frac_real,
                  gwas_overlap,
                  snp_overlap,
                  NUM_SHUFFS,
                  prefix,
                  args.flank)
    tasks = []
    header = [
              'gwas_shuff',
              'snp_shuff',
              'frac_shuff',
              'gwas_real',
              'snp_real',
              'frac_real',
              'OR'
              ]
    print '\t'.join(header)
    for i in xrange(NUM_SHUFFS):
        tasks.append((i,) + shuff_args)
    result_iter = pool.imap_unordered(shuffle_imap, tasks)
    for line in result_iter:
        print line
    pool.close()
    pool.join()
    
    shutil.rmtree(prefix)
    
    return 0
def compare_assemblies(ref_gtf_file, test_gtf_file, output_dir):
    # output files
    if not os.path.exists(output_dir):
        logging.info('Creating output dir: %s' % (output_dir))
        os.makedirs(output_dir)
    # merge step
    merged_gtf_file = os.path.join(output_dir, "merged.gtf")
    merged_sorted_gtf_file = os.path.splitext(merged_gtf_file)[0] + ".srt.gtf"
    merge_done_file = os.path.join(output_dir, 'merged.done')
    sort_done_file = os.path.join(output_dir, 'sort.done')
    if not os.path.exists(merge_done_file):
        # merge and sort ref/test gtf files
        logging.info("Merging reference and test GTF files")
        # make temporary file to store merged ref/test gtf files
        with open(merged_gtf_file, "w") as fileh:
            logging.info("Adding reference GTF file")
            add_gtf_file(ref_gtf_file, fileh, is_ref=True)
            logging.info("Adding test GTF file")
            add_gtf_file(test_gtf_file, fileh, is_ref=False)
        open(merge_done_file, 'w').close()
    if not os.path.exists(sort_done_file):
        logging.info("Sorting merged GTF file")
        # create temp directory
        tmp_dir = os.path.join(output_dir, 'tmp')
        if not os.path.exists(tmp_dir):
            logging.debug("Creating tmp directory '%s'" % (tmp_dir))
            os.makedirs(tmp_dir)
        sort_gtf(merged_gtf_file, merged_sorted_gtf_file, tmp_dir=tmp_dir)
        # cleanup
        shutil.rmtree(tmp_dir)
        open(sort_done_file, 'w').close()
    # compare assemblies
    overlapping_gtf_file = os.path.join(output_dir, 'overlapping.gtf')
    intergenic_tmp_gtf_file = os.path.join(output_dir, 'intergenic.tmp.gtf')
    overlapping_file = os.path.join(output_dir, 'overlapping.tsv')
    overlapping_consensus_file = os.path.join(output_dir,
                                              'overlapping.consensus.tsv')
    overlapping_done_file = os.path.join(output_dir, 'overlapping.done')
    stats_file = os.path.join(output_dir, 'stats.txt')
    stats_obj = GlobalStats()
    num_intergenic = 0
    if not os.path.exists(overlapping_done_file):
        logging.info("Comparing assemblies")
        gtf_fileh = open(overlapping_gtf_file, 'w')
        tmp_gtf_fileh = open(intergenic_tmp_gtf_file, 'w')
        overlapping_fileh = open(overlapping_file, 'w')
        overlapping_consensus_fileh = open(overlapping_consensus_file, 'w')
        for locus_transcripts in parse_gtf(open(merged_sorted_gtf_file)):
            locus_chrom = locus_transcripts[0].chrom
            locus_start = locus_transcripts[0].start
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug(
                "[LOCUS] %s:%d-%d %d transcripts" %
                (locus_chrom, locus_start, locus_end, len(locus_transcripts)))
            for t, match_stats in compare_locus(locus_transcripts):
                if len(match_stats) == 0:
                    # write intergenic transcripts to analyze separately
                    t.attrs['category'] = Category.to_str(Category.INTERGENIC)
                    for f in t.to_gtf_features(source='assembly'):
                        print >> tmp_gtf_fileh, str(f)
                    num_intergenic += 1
                else:
                    # get consensus match information
                    consensus_match = MatchStats.consensus(match_stats)
                    assert consensus_match is not None
                    t.attrs['category'] = consensus_match.category
                    # add gtf attributes and write
                    for f in t.to_gtf_features(source='assembly'):
                        consensus_match.add_gtf_attributes(f)
                        print >> gtf_fileh, str(f)
                    # tab-delimited text output
                    print >> overlapping_consensus_fileh, str(consensus_match)
                    for ms in match_stats:
                        print >> overlapping_fileh, str(ms)
            # compute global statistics
            stats_obj.compute(locus_transcripts)
        logging.info("Reporting global statistics")
        with open(stats_file, 'w') as f:
            print >> f, stats_obj.report()
        gtf_fileh.close()
        tmp_gtf_fileh.close()
        overlapping_fileh.close()
        overlapping_consensus_fileh.close()
        open(overlapping_done_file, 'w').close()
    # resolve intergenic transcripts
    intergenic_gtf_file = os.path.join(output_dir, 'intergenic.gtf')
    intergenic_file = os.path.join(output_dir, 'intergenic.tsv')
    intergenic_best_file = os.path.join(output_dir, 'intergenic.best.tsv')
    intergenic_done_file = os.path.join(output_dir, 'intergenic.done')
    if not os.path.exists(intergenic_done_file):
        logging.info("Building interval index")
        locus_trees = build_locus_trees(merged_sorted_gtf_file)
        logging.info('Finding nearest matches to intergenic transcripts')
        gtf_fileh = open(intergenic_gtf_file, 'w')
        intergenic_fileh = open(intergenic_file, 'w')
        intergenic_best_fileh = open(intergenic_best_file, 'w')
        for locus_transcripts in parse_gtf(open(intergenic_tmp_gtf_file)):
            for t in locus_transcripts:
                # find nearest transcripts
                nearest_transcripts = find_nearest_transcripts(
                    t.chrom, t.start, t.end, t.strand, locus_trees)
                match_stats = []
                best_match = None
                if len(nearest_transcripts) == 0:
                    best_match = MatchStats.from_transcript(t)
                    best_match.category = Category.to_str(Category.INTERGENIC)
                    match_stats.append(best_match)
                else:
                    for ref, category, dist in nearest_transcripts:
                        # create a match object
                        ms = MatchStats.from_transcript(t, ref)
                        ms.shared_same_strand_bp = 0
                        ms.shared_opp_strand_bp = 0
                        ms.shared_introns = 0
                        ms.shared_splicing = False
                        ms.category = Category.to_str(category)
                        ms.distance = dist
                        match_stats.append(ms)
                    # choose the consensus match
                    best_match = MatchStats.choose_best(match_stats)
                # add gtf attributes and write
                for f in t.to_gtf_features(source='assembly'):
                    best_match.add_gtf_attributes(f)
                    print >> gtf_fileh, str(f)
                # write tab-delimited data
                print >> intergenic_best_fileh, str(best_match)
                for ms in match_stats:
                    print >> intergenic_fileh, str(ms)
        gtf_fileh.close()
        intergenic_fileh.close()
        intergenic_best_fileh.close()
        open(intergenic_done_file, 'w').close()
    # merge overlapping and intergenic results
    logging.info('Merging results')
    metadata_file = os.path.join(output_dir, 'metadata.txt')
    metadata_consensus_file = os.path.join(output_dir,
                                           'metadata.consensus.txt')
    assembly_gtf_file = os.path.join(output_dir, 'assembly.cmp.gtf')
    combine_done_file = os.path.join(output_dir, 'done')
    if not os.path.exists(combine_done_file):
        filenames = [overlapping_file, intergenic_file]
        with open(metadata_file, 'w') as outfile:
            print >> outfile, '\t'.join(MatchStats.header_fields())
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        filenames = [overlapping_consensus_file, intergenic_best_file]
        with open(metadata_consensus_file, 'w') as outfile:
            print >> outfile, '\t'.join(MatchStats.header_fields())
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        filenames = [intergenic_gtf_file, overlapping_gtf_file]
        with open(assembly_gtf_file, 'w') as outfile:
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        open(combine_done_file, 'w').close()
    # cleanup
    logging.info("Done")
def orf_analysis(gtf_file, genome_fasta_file, pfam_dir, output_dir,
                 min_orf_length, first_orf_only, num_processes):
    #
    # extract transcript DNA sequences, translate to protein, and
    # search for ORFs
    #
    logging.debug('Finding ORFs in transcript sequences')
    # output files
    tmp_dir = os.path.join(output_dir, 'tmp')
    if not os.path.exists(tmp_dir):
        logging.info("Creating tmp directory '%s'" % (tmp_dir))
        os.makedirs(tmp_dir)
    orf_bed_file = os.path.join(output_dir, 'transcript_orfs.bed')
    unique_orf_file = os.path.join(output_dir, 'unique_orfs.txt')
    unique_orf_bed_file = os.path.join(output_dir, 'unique_orfs.bed')
    orf_file = os.path.join(tmp_dir, 'transcript_orfs.no_ids.txt')
    sorted_orf_file = os.path.join(tmp_dir,
                                   'transcript_orfs.no_ids.sortbyorf.txt')
    sorted_orf_id_file = os.path.join(tmp_dir, 'transcript_orfs.sortbyorf.txt')
    signalp_file = os.path.join(output_dir, 'signalp.txt')
    pfam_file = os.path.join(output_dir, 'pfam.txt')
    merged_orf_id_file = os.path.join(output_dir,
                                      'transcript_orfs.sortbyorf.merged.txt')
    sorted_merged_orf_id_file = os.path.join(
        output_dir, 'transcript_orfs.sortbytranscript.merged.txt')
    # open output files
    orf_fileh = open(orf_file, 'w')
    orf_bed_fileh = open(orf_bed_file, 'w')
    # open genome fasta file
    ref_fa = pysam.Fastafile(genome_fasta_file)
    num_finished = 1
    for locus_transcripts in parse_gtf(open(gtf_file)):
        for t in locus_transcripts:
            if first_orf_only:
                # get first ORF
                orf = get_first_transcript_orf(t, ref_fa)
                if len(orf.seq) >= min_orf_length:
                    print >> orf_fileh, '\t'.join(orf.to_table())
                    print >> orf_bed_fileh, '\t'.join(orf.to_bed())
            else:
                # get all ORFs
                for orf in get_all_transcript_orfs(t, ref_fa, min_orf_length):
                    print >> orf_fileh, '\t'.join(orf.to_table())
                    print >> orf_bed_fileh, '\t'.join(orf.to_bed())
            if (num_finished % 10000) == 0:
                logging.debug('Processed %d transcripts' % (num_finished))
            num_finished += 1
    # cleanup
    orf_fileh.close()
    orf_bed_fileh.close()
    #
    # sort ORF table by ORF amino acid sequence to group identical ORFs
    # together
    #
    logging.debug('Sorting ORFs by amino acid sequence')

    def sort_by_seq(line):
        '''comparison function for batch_sort'''
        fields = line.strip().split('\t')
        return fields[ORFInfo.SEQ_COL_NUM]

    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=orf_file,
               output=sorted_orf_file,
               key=sort_by_seq,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[sort_tmp_dir])
    shutil.rmtree(sort_tmp_dir)
    #
    # assign each ORF a unique id and write to FASTA file
    #
    logging.debug('Determining unique ORFs')
    orf_fasta_prefix = os.path.join(tmp_dir, 'orf')
    orf_fasta_files = []
    orf_fasta_sizes = []
    for i in xrange(num_processes):
        orf_fasta_files.append(open('%s%d.fasta' % (orf_fasta_prefix, i), 'w'))
        orf_fasta_sizes.append(0)
    orf_file_index = 0
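    # ORFs are distributed round-robin across one FASTA file per process so the
    # downstream SignalP/Pfam searches can be split across processes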
    outfileh = open(sorted_orf_id_file, 'w')
    unique_orf_fileh = open(unique_orf_file, 'w')
    print >> unique_orf_fileh, '\t'.join([
        'orf_id', 'orf_length', 'total_occurrences',
        'unique_genomic_occurrences'
    ])
    unique_orf_bed_fileh = open(unique_orf_bed_file, 'w')
    with open(sorted_orf_file) as infileh:
        for orfs in group_unique_orfs(infileh):
            # write to master transcript/ORF table
            for orf in orfs:
                print >> outfileh, '\t'.join(orf.to_table())
            # write ORF to fasta file
            lines = to_fasta(orfs[0].orf_id, orfs[0].seq.strip('*'))
            print >> orf_fasta_files[orf_file_index], lines
            orf_fasta_sizes[orf_file_index] += 1
            # advance to next fasta file
            orf_file_index = (orf_file_index + 1) % (num_processes)
            # group by genomic position and write ORFs to BED file
            unique_genome_orfs = {}
            for orf in orfs:
                k = (orf.chrom, orf.strand, tuple(orf.exons))
                if k in unique_genome_orfs:
                    continue
                unique_genome_orfs[k] = orf
            for orf in unique_genome_orfs.itervalues():
                print >> unique_orf_bed_fileh, '\t'.join(orf.to_bed(
                    orf.orf_id))
            # write unique ORF to tab-delimited text file
            fields = [
                orfs[0].orf_id,
                len(orfs[0].seq),
                len(orfs),
                len(unique_genome_orfs)
            ]
            print >> unique_orf_fileh, '\t'.join(map(str, fields))
    # cleanup
    unique_orf_bed_fileh.close()
    outfileh.close()
    # get fasta files with lines written
    orf_fasta_file_names = []
    for i in xrange(len(orf_fasta_files)):
        orf_fasta_files[i].close()
        if orf_fasta_sizes[i] > 0:
            orf_fasta_file_names.append(orf_fasta_files[i].name)
    #
    # search FASTA file against signalp
    #
    logging.debug('Searching for signal peptides')
    retcode = run_signalp(orf_fasta_file_names, signalp_file, tmp_dir)
    if retcode != 0:
        logging.error('Error searching for signal peptides')
        return 1
    #
    # search FASTA file against Pfam
    #
    logging.debug('Scanning for Pfam domains')
    retcode = run_pfam(orf_fasta_file_names, pfam_dir, pfam_file, tmp_dir)
    if retcode != 0:
        logging.error('Error running pfam_scan.pl')
    #
    # merge results from Pfam and signalp
    #
    logging.debug('Merging SignalP and Pfam results')
    merge_results(sorted_orf_id_file, signalp_file, pfam_file,
                  merged_orf_id_file)
    #
    # sort by transcript id
    #
    logging.debug('Sorting ORFs by transcript ID')

    def sort_by_transcript_id(line):
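        '''key function for batch_sort: sort lines by the first tab-delimited field (the transcript ID)'''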
        return line.split('\t', 1)[0]

    sort_tmp_dir = os.path.join(tmp_dir, 'sort_tmp')
    os.makedirs(sort_tmp_dir)
    batch_sort(input=merged_orf_id_file,
               output=sorted_merged_orf_id_file,
               key=sort_by_transcript_id,
               buffer_size=SORT_BUFFER_SIZE,
               tempdirs=[sort_tmp_dir])
    shutil.rmtree(sort_tmp_dir)
    # cleanup
    ref_fa.close()
    return 0
def compare_assemblies(ref_gtf_file, test_gtf_file, output_dir): 
    # output files
    if not os.path.exists(output_dir):
        logging.info('Creating output dir: %s' % (output_dir))
        os.makedirs(output_dir)
    # merge step
    merged_gtf_file = os.path.join(output_dir, "merged.gtf")
    merged_sorted_gtf_file = os.path.splitext(merged_gtf_file)[0] + ".srt.gtf"
    merge_done_file = os.path.join(output_dir, 'merged.done')
    sort_done_file = os.path.join(output_dir, 'sort.done')
    if not os.path.exists(merge_done_file):
        # merge and sort ref/test gtf files
        logging.info("Merging reference and test GTF files")
        # make temporary file to store merged ref/test gtf files
        with open(merged_gtf_file, "w") as fileh:
            logging.info("Adding reference GTF file")
            add_gtf_file(ref_gtf_file, fileh, is_ref=True)
            logging.info("Adding test GTF file")
            add_gtf_file(test_gtf_file, fileh, is_ref=False)
        open(merge_done_file, 'w').close()
    if not os.path.exists(sort_done_file):        
        logging.info("Sorting merged GTF file")
        # create temp directory
        tmp_dir = os.path.join(output_dir, 'tmp')    
        if not os.path.exists(tmp_dir):
            logging.debug("Creating tmp directory '%s'" % (tmp_dir))
            os.makedirs(tmp_dir)
        sort_gtf(merged_gtf_file, merged_sorted_gtf_file, tmp_dir=tmp_dir)
        # cleanup
        shutil.rmtree(tmp_dir)
        open(sort_done_file, 'w').close()
    # compare assemblies
    overlapping_gtf_file = os.path.join(output_dir, 'overlapping.gtf')
    intergenic_tmp_gtf_file = os.path.join(output_dir, 'intergenic.tmp.gtf')
    overlapping_file = os.path.join(output_dir, 'overlapping.tsv')
    overlapping_consensus_file = os.path.join(output_dir, 'overlapping.consensus.tsv')
    overlapping_done_file = os.path.join(output_dir, 'overlapping.done')
    stats_file = os.path.join(output_dir, 'stats.txt')
    stats_obj = GlobalStats()
    num_intergenic = 0
    if not os.path.exists(overlapping_done_file):
        logging.info("Comparing assemblies")
        gtf_fileh = open(overlapping_gtf_file, 'w')
        tmp_gtf_fileh = open(intergenic_tmp_gtf_file, 'w')
        overlapping_fileh = open(overlapping_file, 'w')
        overlapping_consensus_fileh = open(overlapping_consensus_file, 'w')
        for locus_transcripts in parse_gtf(open(merged_sorted_gtf_file)):
            locus_chrom = locus_transcripts[0].chrom
            locus_start = locus_transcripts[0].start
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug("[LOCUS] %s:%d-%d %d transcripts" % 
                          (locus_chrom, locus_start, locus_end, 
                           len(locus_transcripts)))
            for t, match_stats in compare_locus(locus_transcripts):
                if len(match_stats) == 0:
                    # write intergenic transcripts to analyze separately
                    t.attrs['category'] = Category.to_str(Category.INTERGENIC)
                    for f in t.to_gtf_features(source='assembly'):
                        print >>tmp_gtf_fileh, str(f)
                    num_intergenic += 1
                else:
                    # get consensus match information
                    consensus_match = MatchStats.consensus(match_stats)                    
                    assert consensus_match is not None
                    t.attrs['category'] = consensus_match.category
                    # add gtf attributes and write
                    for f in t.to_gtf_features(source='assembly'):
                        consensus_match.add_gtf_attributes(f)
                        print >>gtf_fileh, str(f)
                    # tab-delimited text output
                    print >>overlapping_consensus_fileh, str(consensus_match)
                    for ms in match_stats:
                        print >>overlapping_fileh, str(ms)
            # compute global statistics
            stats_obj.compute(locus_transcripts)
        logging.info("Reporting global statistics")
        with open(stats_file, 'w') as f:
            print >>f, stats_obj.report()
        gtf_fileh.close()
        tmp_gtf_fileh.close()
        overlapping_fileh.close()
        overlapping_consensus_fileh.close()
        open(overlapping_done_file, 'w').close()
    # resolve intergenic transcripts
    intergenic_gtf_file = os.path.join(output_dir, 'intergenic.gtf')
    intergenic_file = os.path.join(output_dir, 'intergenic.tsv')
    intergenic_best_file = os.path.join(output_dir, 'intergenic.best.tsv')
    intergenic_done_file = os.path.join(output_dir, 'intergenic.done')
    if not os.path.exists(intergenic_done_file):
        logging.info("Building interval index")
        locus_trees = build_locus_trees(merged_sorted_gtf_file)
        logging.info('Finding nearest matches to intergenic transcripts')
        gtf_fileh = open(intergenic_gtf_file, 'w')
        intergenic_fileh = open(intergenic_file, 'w')
        intergenic_best_fileh = open(intergenic_best_file, 'w')
        for locus_transcripts in parse_gtf(open(intergenic_tmp_gtf_file)):
            for t in locus_transcripts:
                # find nearest transcripts
                nearest_transcripts = find_nearest_transcripts(t.chrom, t.start, t.end, t.strand, locus_trees)
                match_stats = []
                best_match = None
                if len(nearest_transcripts) == 0:
                    best_match = MatchStats.from_transcript(t)
                    best_match.category = Category.to_str(Category.INTERGENIC)
                    match_stats.append(best_match)
                else:
                    for ref,category,dist in nearest_transcripts: 
                        # create a match object
                        ms = MatchStats.from_transcript(t, ref)
                        ms.shared_same_strand_bp = 0
                        ms.shared_opp_strand_bp = 0
                        ms.shared_introns = 0
                        ms.shared_splicing = False
                        ms.category = Category.to_str(category)
                        ms.distance = dist
                        match_stats.append(ms)
                    # choose the consensus match
                    best_match = MatchStats.choose_best(match_stats)
                # add gtf attributes and write
                for f in t.to_gtf_features(source='assembly'):
                    best_match.add_gtf_attributes(f)
                    print >>gtf_fileh, str(f)
                # write tab-delimited data
                print >>intergenic_best_fileh, str(best_match)
                for ms in match_stats:
                    print >>intergenic_fileh, str(ms)
        gtf_fileh.close()
        intergenic_fileh.close()
        intergenic_best_fileh.close()
        open(intergenic_done_file, 'w').close()
    # merge overlapping and intergenic results
    logging.info('Merging results')
    metadata_file = os.path.join(output_dir, 'metadata.txt')
    metadata_consensus_file = os.path.join(output_dir, 'metadata.consensus.txt')
    assembly_gtf_file = os.path.join(output_dir, 'assembly.cmp.gtf')
    combine_done_file = os.path.join(output_dir, 'done')
    if not os.path.exists(combine_done_file):
        filenames = [overlapping_file, intergenic_file]
        with open(metadata_file, 'w') as outfile:
            print >>outfile, '\t'.join(MatchStats.header_fields())
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        filenames = [overlapping_consensus_file, intergenic_best_file]
        with open(metadata_consensus_file, 'w') as outfile:
            print >>outfile, '\t'.join(MatchStats.header_fields())
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        filenames = [intergenic_gtf_file, overlapping_gtf_file]
        with open(assembly_gtf_file, 'w') as outfile:
            for fname in filenames:
                with open(fname) as infile:
                    for line in infile:
                        outfile.write(line)
        open(combine_done_file, 'w').close()
    # cleanup
    logging.info("Done")
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('--rename', dest='rename', action='store_true')
    parser.add_argument('gtf_file')
    args = parser.parse_args()
    gtf_file = args.gtf_file
    rename = args.rename
    if not os.path.exists(gtf_file):
        parser.error("GTF file '%s' not found" % (gtf_file))
    # parse transcripts
    num_transcripts = 0
    # keep track of redundant gene/transcript counts
    gene_map = collections.defaultdict(lambda: {})
    transcript_map = collections.defaultdict(lambda: {})
    for transcripts in parse_gtf(open(gtf_file)):
        for t in transcripts:
            catstr = t.attrs['category']
            catint = Category.to_int(catstr)
            gene_type = t.attrs.get('gene_type', None)
            ref_gene_type = t.attrs['ref_gene_type']
            if catint == Category.SAME_STRAND:
                # impute gene type
                new_gene_type = ref_gene_type
            else:
                if gene_type == 'protein_coding':
                    # don't change protein coding genes
                    new_gene_type = gene_type
                elif t.length < 250:
                    # categorize small RNA separately
                    new_gene_type = 'misc_RNA'
                else:
                    if ref_gene_type == 'protein_coding':
                        # categorize based on overlap with reference
                        new_gene_type = PROTEIN_CATEGORY_MAP[catint]
                    else:
                        # reference is also non-coding
                        new_gene_type = 'lincRNA'
            # get gene category
            gene_category = GENCODE_CATEGORY_MAP[new_gene_type]
            new_gene_name = None
            if rename:
                # resolve upper/lower case issue with gene names from 
                # different databases
                ref_gene_name = t.attrs['ref_gene_name'].upper()
                # build new gene name                
                if ref_gene_name == 'NONE':
                    new_gene_name = str(t.attrs['source'])
                elif catint == Category.SAME_STRAND:
                    new_gene_name = str(ref_gene_name)
                else:
                    new_gene_name = '%s.%s' % (ref_gene_name, catstr)
                # gene name string is key to a dictionary that
                # associates each gene id with an integer number
                gene_id = t.attrs['gene_id']
                gene_dict = gene_map[new_gene_name]
                if gene_id not in gene_dict:
                    gene_num = len(gene_dict) + 1
                    gene_dict[gene_id] = gene_num
                else:
                    gene_num = gene_dict[gene_id]
                # gene id is also key to dict that associates each isoform
                # of gene with integer number
                t_id = t.attrs['transcript_id']
                t_dict = transcript_map[gene_id]
                if t_id not in t_dict:
                    t_num = len(t_dict) + 1
                    t_dict[t_id] = t_num
                else:
                    t_num = t_dict[t_id]
                # append gene/transcript integers to gene name
                new_gene_name = '%s.%d.%d' % (new_gene_name, gene_num, t_num)
            # write new attributes
            for f in t.to_gtf_features(source='assemblyline', score=1000):
                f.attrs['gene_type'] = new_gene_type
                f.attrs['gene_category'] = gene_category 
                if rename:
                    if 'gene_name' in f.attrs:
                        f.attrs['orig_gene_name'] = f.attrs['gene_name']
                    f.attrs['gene_name'] = new_gene_name        
                print str(f)
            num_transcripts += 1
    return 0
Example #43
0
def main():
    logging.basicConfig(level=logging.DEBUG,
                        format="%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    parser = argparse.ArgumentParser()
    parser.add_argument('gtf_file')
    parser.add_argument('excl_file')
    parser.add_argument('chrom_sizes')
    parser.add_argument("output_prefix")
    args = parser.parse_args()
    prefix = args.output_prefix
    excl_file = args.excl_file
    chrom_sizes_file = args.chrom_sizes
    gtf_file = args.gtf_file
    # check command line parameters
    if which('bedtools') is None:
        parser.error('bedtools binary not found in PATH')
    if not os.path.exists(chrom_sizes_file):
        parser.error('chrom sizes file %s not found' % (chrom_sizes_file))
    gene_intervals_file = prefix + '.gene_intervals.bed'
    gene_intervals_shuffled_file = prefix + '.gene_intervals.shuffle.bed'
    shuffled_gtf_file = prefix + '.shuffle.gtf'
    sorted_shuffled_gtf_file = prefix + '.shuffle.srt.gtf'
    logging.info('Parsing GTF file')
    with open(gene_intervals_file, 'w') as f:
        for locus_transcripts in parse_gtf(open(gtf_file)):
            # find borders of locus
            locus_chrom = locus_transcripts[0].chrom
            locus_start = min(t.start for t in locus_transcripts)
            locus_end = max(t.end for t in locus_transcripts)
            logging.debug("[LOCUS] %s:%d-%d %d transcripts" % 
                          (locus_chrom, locus_start, locus_end, 
                           len(locus_transcripts)))
            for g in get_gene_intervals(locus_transcripts):
                print >>f, '\t'.join(map(str, [g.chrom, g.start, g.end, g.gene_id]))    
    # randomly shuffle genes
    logging.info("Shuffling genes")
    # use a separate name for the bedtools command so the argparse namespace 'args' is not clobbered
    args_shuff = ['bedtools', 'shuffle',
            '-excl', excl_file,
            '-i', gene_intervals_file,
            '-g', chrom_sizes_file]
    with open(gene_intervals_shuffled_file, 'w') as fileh:
        subprocess.call(args_shuff, stdout=fileh)
    # read new gene positions
    logging.info("Reading shuffled gene intervals")
    shuffle_gene_map = {}
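    # maps gene_id -> (chrom, start, end) at the shuffled location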
    with open(gene_intervals_shuffled_file) as fileh:
        for line in fileh:
            fields = line.strip().split('\t')
            chrom = fields[0]
            start = int(fields[1])
            end = int(fields[2])
            gene_id = fields[3]
            shuffle_gene_map[gene_id] = (chrom, start, end)
    # reposition transcripts
    logging.info("Repositioning transcripts")
    with open(shuffled_gtf_file, 'w') as fileh:
        for locus_transcripts in parse_gtf(open(gtf_file)):
            # get original positions
            orig_gene_map = {}
            for g in get_gene_intervals(locus_transcripts):
                orig_gene_map[g.gene_id] = (g.chrom, g.start, g.end)
            for t in locus_transcripts:
                gene_id = t.attrs['gene_id']
                orig_chrom, orig_start, orig_end = orig_gene_map[gene_id]
                if gene_id not in shuffle_gene_map:
                    logging.warning('Gene %s [%s:%d-%d] could not be shuffled' % (gene_id, orig_chrom, orig_start, orig_end))
                    continue
                new_chrom, new_start, new_end = shuffle_gene_map[gene_id]
                # reposition transcript
                t.chrom = new_chrom
                t.start = new_start + (t.start - orig_start)
                t.end = new_start + (t.end - orig_start)
                for e in t.exons:
                    e.start = new_start + (e.start - orig_start)
                    e.end = new_start + (e.end - orig_start)
                
                fields = write_bed(t.chrom, t.attrs['transcript_id'], t.strand, 1000, t.exons)
                print '\t'.join(fields)
                # also write the repositioned transcript to the shuffled GTF file so
                # the sort step below has data to operate on
                for f in t.to_gtf_features(source='shuffle'):
                    print >>fileh, str(f)
    logging.info("Sorting GTF file")
    sort_gtf(shuffled_gtf_file, sorted_shuffled_gtf_file)