def processChunk(contig, chunk, options, fasta=None):
    """Accumulate per-window coverage for one contig and print it.

    The entries in *chunk* must not overlap each other.

    The window layout comes from ``options.window_size`` if it is set.
    Otherwise ``options.num_bins`` is combined with the contig length
    reported by ``fasta.getLength(contig)``.  For every feature name in
    ``options.features`` the running number of covered bases is recorded
    once per window, and the table is handed to ``printValues``.

    Arguments
    ---------
    contig : name of the contig the entries in *chunk* lie on.
    chunk : sized collection of entries exposing ``start``, ``end`` and
        ``feature``; it is traversed once per requested feature.
    options : object providing ``window_size``, ``num_bins`` and
        ``features``.
    fasta : optional object with a ``getLength(contig)`` method; only
        consulted when no window size is given.

    Raises
    ------
    ValueError
        If two entries of *chunk* overlap, or if neither a window size
        nor a bin count together with *fasta* is available.
    """
    if len(chunk) == 0:
        return

    # Compare every entry with all entries that are not equal to one
    # already visited; any overlapping pair aborts the histogram.
    visited = []
    for current in chunk:
        visited.append(current)
        pending = [candidate for candidate in chunk
                   if candidate not in visited]
        for candidate in pending:
            if GTF.Overlap(current, candidate):
                raise ValueError(" Histogram could not be created"
                                 " since the file contains overlapping "
                                 "features! \n%s\n%s " % (current, candidate))
    # the bookkeeping list is no longer needed
    del visited[:]

    # right-most coordinate covered by any entry
    max_coordinate = max(entry.end for entry in chunk)

    # derive window size and number of windows
    if options.window_size:
        window_size = options.window_size
        num_bins = int(math.ceil(float(max_coordinate) / window_size))
    elif options.num_bins and fasta:
        contig_length = fasta.getLength(contig)
        assert max_coordinate <= contig_length, (
            "maximum coordinate (%i) "
            "larger than contig size (%i)"
            " for contig %s" % (max_coordinate, contig_length, contig))
        # with a known contig size the histogram spans the whole contig
        max_coordinate = contig_length
        window_size = int(math.floor(float(contig_length) / options.num_bins))
        num_bins = options.num_bins
    else:
        raise ValueError("please specify a window size of provide "
                         "genomic sequence with number of bins.")

    # one list per window; each requested feature appends one value to it
    values = [[] for _ in range(num_bins)]

    # One pass over the chunk per feature: slow, but simple.  The
    # alternative would be sorting by feature and location.
    for wanted in options.features:
        covered = 0
        bin_index = 0
        window_end = window_size

        for entry in chunk:
            if entry.feature != wanted:
                continue

            # windows that end before this entry starts keep the old total
            while window_end < entry.start:
                values[bin_index].append(covered)
                bin_index += 1
                window_end += window_size

            # the entry extends beyond the current window: add the part
            # inside the window, close the window and move on
            while entry.end > window_end:
                window_start = window_end - window_size
                covered += (min(entry.end, window_end)
                            - max(entry.start, window_start))
                values[bin_index].append(covered)
                window_end += window_size
                bin_index += 1

            # remainder of the entry lies in the window that is still open
            window_start = window_end - window_size
            covered += (min(entry.end, window_end)
                        - max(entry.start, window_start))

        # pad all remaining windows with the final total
        while bin_index < num_bins:
            values[bin_index].append(covered)
            bin_index += 1

    printValues(contig, max_coordinate, window_size, values, options)
def _readGtfEntries(filename):
    """Read all entries from *filename* and close the file handle again."""
    infile = IOTools.openFile(filename, "r")
    try:
        return GTF.readFromFile(infile)
    finally:
        infile.close()


def _writeOverlapPair(outfile, output_format, symbol, entry1, entry2):
    """Write one overlapping pair to *outfile* in the requested layout.

    Formats other than "flat" and "multi-line" produce no output.
    """
    if output_format == "flat":
        outfile.write("%s\t%s\t%s\n" % (symbol, str(entry1), str(entry2)))
    elif output_format == "multi-line":
        outfile.write("%s\t%s\n\t%s\n" % (symbol, str(entry1), str(entry2)))


def _closeUnlessStdout(outfile, options):
    """Close *outfile* unless it is the shared ``options.stdout`` stream."""
    if outfile != options.stdout:
        outfile.close()


def _writeUniqueGenes(options, section, column, filename, all_genes,
                      matched_genes, outfile_total):
    """Write the genes without a match and add a row to the gene totals.

    The genes of *all_genes* that are missing from *matched_genes* go to
    the output *section* below the header *column*.  One summary line for
    *filename* is appended to *outfile_total*.
    """
    outfile = getFile(options, section)
    unique_genes = all_genes.difference(matched_genes)
    outfile.write("%s\n" % column)
    outfile.write("\n".join(unique_genes) + "\n")
    _closeUnlessStdout(outfile, options)

    outfile_total.write(
        "%s\t%i\t%i\t%5.2f\t%i\t%5.2f\n" %
        (os.path.basename(filename),
         len(all_genes),
         len(matched_genes),
         100.0 * len(matched_genes) / len(all_genes),
         len(unique_genes),
         100.0 * len(unique_genes) / len(all_genes)))


def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.

    Exactly two positional arguments are required: the two files to
    compare.  Only records whose feature is "exon" are kept.  Both sets
    are sorted by (contig, strand, start, end) and walked in parallel.

    Output sections (each obtained through ``getFile``):

    * "diff": entries found in one file only, prefixed with "<" (first
      file) or ">" (second file).
    * "overlap": overlapping pairs, written only with --write-equivalent.
      The leading symbol is "=" for identical coordinates, "|" when only
      start or end agree and "~" otherwise.  It is replaced by "/" when
      the next entry of the first file overlaps the same entry of the
      second file, and by "\\" for the mirrored case.
    * "genes_ovl", "genes_uniq1", "genes_uniq2", "genes_total": gene
      level summaries, written with --as-gtf when gene pairs were found.
    * "total": one row of counts per subtotal plus an "all" row.

    Raises
    ------
    ValueError
        If the number of positional arguments is not two.
    """
    if argv is None:
        argv = sys.argv
    # NOTE(review): argv is not forwarded to E.Start() below, so an
    # explicitly supplied *argv* is presumably ignored - confirm against
    # the signature of E.Start before changing the call.

    parser = E.OptionParser(
        version="%prog version: $Id: diff_gff.py 2781 2009-09-10 11:33:14Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-e", "--write-equivalent", dest="write_equivalent",
                      help="write equivalent entries [default=%default].",
                      action="store_true")

    parser.add_option("-f", "--write-full", dest="write_full",
                      help="write full gff entries [default=%default].",
                      action="store_true")

    parser.add_option(
        "-o", "--format=", dest="format",
        help="output format [flat|multi-line] [default=%default]")

    parser.add_option("-p", "--add-percent", dest="add_percent",
                      action="store_true",
                      help="add percentage columns [default=%default].")

    parser.add_option(
        "-a", "--as-gtf", "--is-gtf", dest="as_gtf", action="store_true",
        help="input is in gtf format. Output on overlapping genes will be output [default=%default].")

    parser.add_option("-s", "--ignore-strand", dest="ignore_strand",
                      action="store_true",
                      help="ignore strand information [default=%default].")

    parser.set_defaults(
        write_equivalent=False,
        write_full=False,
        format="flat",
        add_percent=False,
        ignore_strand=False,
        as_gtf=False,
    )

    (options, args) = E.Start(parser, add_output_options=True)

    if len(args) != 2:
        raise ValueError("two arguments required")

    input_filename1, input_filename2 = args

    # duplicated features cause a problem. Make sure
    # features are non-overlapping by running
    # gff_combine.py on GFF files first.

    E.info("reading data")

    # Both input flavours are read by the same reader; --as-gtf only
    # switches on the gene level output further below.
    gff1 = _readGtfEntries(input_filename1)
    gff2 = _readGtfEntries(input_filename2)

    E.info("reading data finished: %i, %i" % (len(gff1), len(gff2)))

    # removing everything but exons
    gff1 = [x for x in gff1 if x.feature == "exon"]
    gff2 = [x for x in gff2 if x.feature == "exon"]

    E.info("after keeping only 'exons': %i, %i" % (len(gff1), len(gff2)))

    if options.ignore_strand:
        for e in gff1:
            e.strand = "."
        for e in gff2:
            e.strand = "."

    E.info("sorting exons")

    gff1.sort(key=lambda x: (x.contig, x.strand, x.start, x.end))
    gff2.sort(key=lambda x: (x.contig, x.strand, x.start, x.end))

    E.info("sorting exons finished")

    subtotals = []
    subtotal = Counts(add_percent=options.add_percent)

    outfile_diff = getFile(options, "diff")
    outfile_overlap = getFile(options, "overlap")

    # gene pairs are only collected for gtf input
    if options.as_gtf:
        overlapping_genes = []
    else:
        overlapping_genes = None

    i1, i2 = 0, 0
    n1 = len(gff1)
    n2 = len(gff2)
    first_entry1, first_entry2 = None, None

    while i1 < n1 and i2 < n2:

        entry1 = gff1[i1]
        entry2 = gff2[i2]

        E.debug("1: i1=%i n1=%i entry1=%s" % (i1, n1, str(entry1)))
        E.debug("2: i2=%i n2=%i entry2=%s" % (i2, n2, str(entry2)))

        # when chromosome/strand have changed in both (and are the same),
        # print summary info:
        if first_entry1 is None:
            first_entry1 = entry1
            first_entry2 = entry2
        elif ((first_entry1.contig != entry1.contig or
               first_entry1.strand != entry1.strand) and
              (first_entry2.contig != entry2.contig or
               first_entry2.strand != entry2.strand) and
              entry1.contig == entry2.contig and
              entry1.strand == entry2.strand):
            subtotals.append(
                (first_entry1.contig, first_entry1.strand, subtotal))
            subtotal = Counts(add_percent=options.add_percent)
            first_entry1 = entry1
            first_entry2 = entry2

        output_1, output_2 = None, None

        if GTF.Overlap(entry1, entry2):

            # collect multiple matches
            last_l = True
            while GTF.Overlap(entry1, entry2):

                if overlapping_genes is not None:
                    overlapping_genes.append((entry1.gene_id, entry2.gene_id))

                # stays True only if the loop is left through a break
                # before the pair has been written
                write_last = True
                subtotal.noverlap += 1
                if entry1.start == entry2.start and entry1.end == entry2.end:
                    symbol = "="
                    subtotal.nidentical += 1
                elif entry1.start == entry2.start or entry1.end == entry2.end:
                    symbol = "|"
                    subtotal.nhalf += 1
                else:
                    symbol = "~"

                output_1 = entry1
                output_2 = entry2

                # advance on the side whose entry ends first
                if entry1.end < entry2.end:
                    i1 += 1
                    subtotal.nleft += 1
                    last_l = True
                    if i1 >= n1:
                        # Left side exhausted.  The matched right entry is
                        # skipped exactly once by the step after this loop;
                        # advancing it here as well would drop an entry.
                        break
                    entry1 = gff1[i1]
                    if GTF.Overlap(entry1, entry2):
                        # split right
                        symbol = "/"
                        subtotal.nsplit_right += 1
                else:
                    i2 += 1
                    subtotal.nright += 1
                    last_l = False
                    if i2 >= n2:
                        # Right side exhausted; mirrored case of the above.
                        break
                    entry2 = gff2[i2]
                    if GTF.Overlap(entry1, entry2):
                        # split left
                        symbol = "\\"
                        subtotal.nsplit_left += 1

                # output at the end, so that symbol is known
                if options.write_equivalent:
                    _writeOverlapPair(outfile_overlap, options.format,
                                      symbol, output_1, output_2)

                write_last = False

            # pair that was still pending when the loop ended via break
            if (write_last and output_1 is not None and
                    output_2 is not None and options.write_equivalent):
                _writeOverlapPair(outfile_overlap, options.format,
                                  symbol, output_1, output_2)

            # if last advance was left, go right, and vice versa
            if last_l:
                i2 += 1
                subtotal.nright += 1
            else:
                i1 += 1
                subtotal.nleft += 1

        else:
            # NOTE(review): if _cmp() returns 0 for a pair that does not
            # overlap, neither index advances - confirm this cannot occur.
            order = _cmp(entry1, entry2)
            if order < 0:
                outfile_diff.write("<\t%s\n" % str(entry1))
                subtotal.nunique_left += 1
                i1 += 1
                subtotal.nleft += 1
            elif order > 0:
                outfile_diff.write(">\t%s\n" % str(entry2))
                subtotal.nunique_right += 1
                i2 += 1
                subtotal.nright += 1

    # Whatever is left on either side has no partner.  The entries are
    # taken from the lists by index: the loop variables may refer to an
    # entry that was already reported, and are unset for an empty input.
    for entry in gff1[i1:]:
        outfile_diff.write("<\t%s\n" % str(entry))
        subtotal.nunique_left += 1
        subtotal.nleft += 1

    for entry in gff2[i2:]:
        outfile_diff.write(">\t%s\n" % str(entry))
        subtotal.nunique_right += 1
        subtotal.nright += 1

    # label the final subtotal with the last entry seen; there is nothing
    # to label if both inputs are empty
    if gff1:
        last_entry = gff1[-1]
    elif gff2:
        last_entry = gff2[-1]
    else:
        last_entry = None
    if last_entry is not None:
        subtotals.append((last_entry.contig, last_entry.strand, subtotal))

    _closeUnlessStdout(outfile_diff, options)
    _closeUnlessStdout(outfile_overlap, options)

    ##################################################################
    # print gene based information
    ##################################################################
    if overlapping_genes:

        outfile = getFile(options, "genes_ovl")
        gene_pairs = set(overlapping_genes)
        outfile.write("gene_id1\tgene_id2\n")
        for a, b in gene_pairs:
            outfile.write("%s\t%s\n" % (a, b))
        _closeUnlessStdout(outfile, options)

        outfile_total = getFile(options, "genes_total")
        outfile_total.write(
            "set\tngenes\tnoverlapping\tpoverlapping\tnunique\tpunique\n")

        _writeUniqueGenes(options, "genes_uniq1", "gene_id1",
                          input_filename1,
                          set([x.gene_id for x in gff1]),
                          set([x[0] for x in gene_pairs]),
                          outfile_total)

        _writeUniqueGenes(options, "genes_uniq2", "gene_id2",
                          input_filename2,
                          set([x.gene_id for x in gff2]),
                          set([x[1] for x in gene_pairs]),
                          outfile_total)

        _closeUnlessStdout(outfile_total, options)

    ##################################################################
    # print totals
    ##################################################################
    outfile = getFile(options, "total")
    outfile.write("chr\tstrand\t%s\n" %
                  Counts(add_percent=options.add_percent).getHeader())

    total = Counts(add_percent=options.add_percent)
    for x in subtotals:
        outfile.write("\t".join((x[0], x[1], str(x[2]))) + "\n")
        total += x[2]

    outfile.write("\t".join(("all", "all", str(total))) + "\n")

    _closeUnlessStdout(outfile, options)

    E.Stop()