def aggregateWindowsReadCounts(infiles, outfile, regex=r"(.*)\..*"):
    '''aggregate several results from coverageBed into a single file.

    coverageBed outputs the following columns:

    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from
      features in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    The count column is computed from the number of columns found in
    the first input file (number of columns - 4 + 1):

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    Windows without any counts will not be output.

    Arguments
    ---------
    infiles : list
        Filenames of compressed coverageBed output (read with ``zcat``).
        The column layout is detected from the first file only.
    outfile : string
        Output filename. A tab-separated table is written with an
        ``interval_id`` column (``contig:start-end``) followed by one
        count column per input file.
    regex : string
        Regular expression applied to the basename of each input file;
        its first group becomes the track (column) name. The default
        strips the final ``.suffix``.

    Raises
    ------
    AttributeError
        If *regex* does not match the basename of an input file.
    AssertionError
        If a line with non-zero counts carries differing interval ids
        across the pasted input files.
    '''

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    # one bash process substitution per input file, each emitting
    # "contig:start-end<TAB>count" lines
    src = " ".join([
        '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) ''' % (x, column)
        for x in infiles
    ])
    tmpfile = P.getTempFilename(".")

    try:
        # NOTE(review): P.run() is called without arguments, so it
        # presumably picks up ``statement`` (and the ``src``/``tmpfile``
        # placeholders) from this function's local variables - do not
        # rename or remove them; confirm against P.run.
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        # build track names
        tracks = [
            re.search(regex, os.path.basename(x)).groups()[0]
            for x in infiles
        ]

        outf = IOTools.openFile(outfile, "w")
        try:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))

            with open(tmpfile, "r") as inf:
                for line in inf:
                    # strip only the newline so that a final line without
                    # a terminator does not lose its last digit
                    data = line.rstrip("\n").split("\t")
                    # pasted columns alternate: id, count, id, count, ...
                    genes = list(set(data[0::2]))
                    values = [int(x) for x in data[1::2]]
                    if sum(values) == 0:
                        continue
                    assert len(genes) == 1, \
                        "paste command failed, wrong number of genes per line: '%s'" % line
                    outf.write("%s\t%s\n" % (
                        genes[0], "\t".join(map(str, values))))
        finally:
            outf.close()
    finally:
        # always remove the temporary file, also when a step above failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def aggregateWindowsReadCounts(infiles, outfile, regex=r"(.*)\..*"):
    '''aggregate several results from coverageBed into a single file.

    coverageBed outputs the following columns:

    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from
      features in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    The count column is computed from the number of columns found in
    the first input file (number of columns - 4 + 1):

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    Windows without any counts will not be output.

    Arguments
    ---------
    infiles : list
        Filenames of compressed coverageBed output (read with ``zcat``).
        The column layout is detected from the first file only.
    outfile : string
        Output filename. A tab-separated table is written with an
        ``interval_id`` column (``contig:start-end``) followed by one
        count column per input file.
    regex : string
        Regular expression applied to the basename of each input file;
        its first group becomes the track (column) name. The default
        strips the final ``.suffix``.

    Raises
    ------
    AttributeError
        If *regex* does not match the basename of an input file.
    AssertionError
        If a line with non-zero counts carries differing interval ids
        across the pasted input files.
    '''

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    # one bash process substitution per input file, each emitting
    # "contig:start-end<TAB>count" lines
    src = " ".join([
        '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) ''' % (x, column)
        for x in infiles
    ])
    tmpfile = P.getTempFilename(".")

    try:
        # NOTE(review): P.run() is called without arguments, so it
        # presumably picks up ``statement`` (and the ``src``/``tmpfile``
        # placeholders) from this function's local variables - do not
        # rename or remove them; confirm against P.run.
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        # build track names
        tracks = [
            re.search(regex, os.path.basename(x)).groups()[0]
            for x in infiles
        ]

        outf = IOTools.openFile(outfile, "w")
        try:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))

            with open(tmpfile, "r") as inf:
                for line in inf:
                    # strip only the newline so that a final line without
                    # a terminator does not lose its last digit
                    data = line.rstrip("\n").split("\t")
                    # pasted columns alternate: id, count, id, count, ...
                    genes = list(set(data[0::2]))
                    values = [int(x) for x in data[1::2]]
                    if sum(values) == 0:
                        continue
                    assert len(genes) == 1, \
                        "paste command failed, wrong number of genes per line: '%s'" % line
                    outf.write("%s\t%s\n" % (
                        genes[0], "\t".join(map(str, values))))
        finally:
            outf.close()
    finally:
        # always remove the temporary file, also when a step above failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def aggregateWindowsReadCounts(infiles, outfile):
    '''aggregate tag counts for each window.

    coverageBed outputs the following columns:

    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one
       base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from
       features in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from
       features in A.

    The count column is computed from the number of columns found in
    the first input file (number of columns - 4 + 1):

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    Tiles with no counts will not be output.

    Arguments
    ---------
    infiles : list
        Filenames of compressed coverageBed output (read with ``zcat``).
        The column layout is detected from the first file only.
    outfile : string
        Output filename. A tab-separated table is written with an
        ``interval_id`` column (``contig:start-end``) followed by one
        count column per input file. Column names are the basenames
        of the input files with everything from the first ``.``
        onwards removed.

    Raises
    ------
    AssertionError
        If a line with non-zero counts carries differing interval ids
        across the pasted input files.
    '''

    # NOTE(review): not referenced in this function; presumably read by
    # P.run() from the caller's local variables to request cluster
    # execution - keep it and confirm against P.run.
    to_cluster = True

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    # one bash process substitution per input file, each emitting
    # "contig:start-end<TAB>count" lines
    src = " ".join([
        '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) ''' % (x, column)
        for x in infiles
    ])
    tmpfile = P.getTempFilename(".")

    try:
        # NOTE(review): P.run() is called without arguments, so it
        # presumably picks up ``statement`` (and the ``src``/``tmpfile``
        # placeholders) from this function's local variables - do not
        # rename or remove them; confirm against P.run.
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        # track name: basename up to (excluding) the first "."
        tracks = [re.sub(r"\..*", '', os.path.basename(x)) for x in infiles]

        outf = IOTools.openFile(outfile, "w")
        try:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))

            with open(tmpfile, "r") as inf:
                for line in inf:
                    # strip only the newline so that a final line without
                    # a terminator does not lose its last digit
                    data = line.rstrip("\n").split("\t")
                    # pasted columns alternate: id, count, id, count, ...
                    genes = list(set(data[0::2]))
                    values = [int(x) for x in data[1::2]]
                    if sum(values) == 0:
                        continue
                    assert len(genes) == 1, \
                        "paste command failed, wrong number of genes per line: '%s'" % line
                    outf.write("%s\t%s\n" % (
                        genes[0], "\t".join(map(str, values))))
        finally:
            outf.close()
    finally:
        # always remove the temporary file, also when a step above failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def aggregateWindowsReadCounts(infiles, outfile):
    '''aggregate tag counts for each window.

    coverageBed outputs the following columns:

    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one
       base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from
       features in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from
       features in A.

    The count column is computed from the number of columns found in
    the first input file (number of columns - 4 + 1):

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    Tiles with no counts will not be output.

    Arguments
    ---------
    infiles : list
        Filenames of compressed coverageBed output (read with ``zcat``).
        The column layout is detected from the first file only.
    outfile : string
        Output filename. A tab-separated table is written with an
        ``interval_id`` column (``contig:start-end``) followed by one
        count column per input file. Column names are the basenames
        of the input files with everything from the first ``.``
        onwards removed.

    Raises
    ------
    AssertionError
        If a line with non-zero counts carries differing interval ids
        across the pasted input files.
    '''

    # NOTE(review): not referenced in this function; presumably read by
    # P.run() from the caller's local variables to request cluster
    # execution - keep it and confirm against P.run.
    to_cluster = True

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    # one bash process substitution per input file, each emitting
    # "contig:start-end<TAB>count" lines
    src = " ".join([
        '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) ''' % (x, column)
        for x in infiles
    ])
    tmpfile = P.getTempFilename(".")

    try:
        # NOTE(review): P.run() is called without arguments, so it
        # presumably picks up ``statement`` (and the ``src``/``tmpfile``
        # placeholders) from this function's local variables - do not
        # rename or remove them; confirm against P.run.
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        # track name: basename up to (excluding) the first "."
        tracks = [re.sub(r"\..*", '', os.path.basename(x)) for x in infiles]

        outf = IOTools.openFile(outfile, "w")
        try:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))

            with open(tmpfile, "r") as inf:
                for line in inf:
                    # strip only the newline so that a final line without
                    # a terminator does not lose its last digit
                    data = line.rstrip("\n").split("\t")
                    # pasted columns alternate: id, count, id, count, ...
                    genes = list(set(data[0::2]))
                    values = [int(x) for x in data[1::2]]
                    if sum(values) == 0:
                        continue
                    assert len(genes) == 1, \
                        "paste command failed, wrong number of genes per line: '%s'" % line
                    outf.write("%s\t%s\n" % (
                        genes[0], "\t".join(map(str, values))))
        finally:
            outf.close()
    finally:
        # always remove the temporary file, also when a step above failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
def aggregateWindowsTagCounts(infiles, outfile, regex=r"(.*)\..*"):
    '''aggregate output from several ``bedtools coverage`` results.

    ``bedtools coverage`` outputs the following columns for a bed4
    file::

       1 Contig
       2 Start
       3 Stop
       4 Name
       5 The number of features in A that overlapped (by at least one
         base pair) the B interval.
       6 The number of bases in B that had non-zero coverage from
         features in A.
       7 The length of the entry in B.
       8 The fraction of bases in B that had non-zero coverage from
         features in A.

    This method autodetects the number of columns in the
    :term:`infiles` (using the first file only) and selects:

    * bed4: use column 5
    * bed6: use column 7
    * bed12: use column 13

    Consecutive lines carrying the same interval id are collapsed:
    only the first of such a run is written. A summary of the
    input/duplicate/output line counts is logged on success.

    Arguments
    ---------
    infiles : list
        Input filenames with the output from ``bedtools coverage``
        (compressed; read with ``zcat``).
    outfile : string
        Output filename in :term:`tsv` format. The first column is
        ``interval_id`` (``contig:start-end``), followed by one count
        column per input file.
    regex : string
        Regular expression used to extract the track name from the
        filename (first group). The default removes the final
        ``.suffix``.

    Raises
    ------
    AttributeError
        If *regex* does not match the basename of an input file.
    AssertionError
        If a line carries differing interval ids across the pasted
        input files.
    '''

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    # one bash process substitution per input file, each emitting
    # "contig:start-end<TAB>count" lines
    src = " ".join([
        """<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}')""" % (x, column)
        for x in infiles
    ])
    tmpfile = P.get_temp_filename(".")

    try:
        # NOTE(review): the ``%(src)s``/``%(tmpfile)s`` placeholders are
        # not filled in here; presumably P.run() interpolates them from
        # this function's local variables - do not rename ``src`` or
        # ``tmpfile``; confirm against P.run.
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run(statement)

        # build track names
        tracks = [
            re.search(regex, os.path.basename(x)).groups()[0]
            for x in infiles
        ]

        outf = IOTools.open_file(outfile, "w")
        try:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))

            # filter for uniqueness - keys with the same value as the
            # previous line will be ignored.
            last_gene = None
            c = E.Counter()
            with open(tmpfile, "r") as inf:
                for line in inf:
                    c.input += 1
                    # strip only the newline so that a final line without
                    # a terminator does not lose its last digit
                    data = line.rstrip("\n").split("\t")
                    # pasted columns alternate: id, count, id, count, ...
                    genes = list(set(data[0::2]))
                    values = [int(x) for x in data[1::2]]

                    assert len(genes) == 1, \
                        "paste command failed, wrong number of genes per line: '%s'" % line
                    if genes[0] == last_gene:
                        c.duplicates += 1
                        continue
                    c.output += 1
                    outf.write("%s\t%s\n" % (
                        genes[0], "\t".join(map(str, values))))
                    last_gene = genes[0]
        finally:
            outf.close()
    finally:
        # always remove the temporary file, also when a step above failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)

    E.info("aggregateWindowsTagCounts: %s" % c)
def aggregateWindowsTagCounts(infiles, outfile, regex=r"(.*)\..*"):
    '''aggregate output from several ``bedtools coverage`` results.

    ``bedtools coverage`` outputs the following columns for a bed4
    file::

       1 Contig
       2 Start
       3 Stop
       4 Name
       5 The number of features in A that overlapped (by at least one
         base pair) the B interval.
       6 The number of bases in B that had non-zero coverage from
         features in A.
       7 The length of the entry in B.
       8 The fraction of bases in B that had non-zero coverage from
         features in A.

    This method autodetects the number of columns in the
    :term:`infiles` (using the first file only) and selects:

    * bed4: use column 5
    * bed6: use column 7
    * bed12: use column 13

    Consecutive lines carrying the same interval id are collapsed:
    only the first of such a run is written. A summary of the
    input/duplicate/output line counts is logged on success.

    Arguments
    ---------
    infiles : list
        Input filenames with the output from ``bedtools coverage``
        (compressed; read with ``zcat``).
    outfile : string
        Output filename in :term:`tsv` format. The first column is
        ``interval_id`` (``contig:start-end``), followed by one count
        column per input file.
    regex : string
        Regular expression used to extract the track name from the
        filename (first group). The default removes the final
        ``.suffix``.

    Raises
    ------
    AttributeError
        If *regex* does not match the basename of an input file.
    AssertionError
        If a line carries differing interval ids across the pasted
        input files.
    '''

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    # one bash process substitution per input file, each emitting
    # "contig:start-end<TAB>count" lines
    src = " ".join([
        """<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}')""" % (x, column)
        for x in infiles
    ])
    tmpfile = P.getTempFilename(".")

    try:
        # NOTE(review): P.run() is called without arguments, so it
        # presumably picks up ``statement`` (and the ``src``/``tmpfile``
        # placeholders) from this function's local variables - do not
        # rename or remove them; confirm against P.run.
        statement = '''paste %(src)s > %(tmpfile)s'''
        P.run()

        # build track names
        tracks = [
            re.search(regex, os.path.basename(x)).groups()[0]
            for x in infiles
        ]

        outf = IOTools.openFile(outfile, "w")
        try:
            outf.write("interval_id\t%s\n" % "\t".join(tracks))

            # filter for uniqueness - keys with the same value as the
            # previous line will be ignored.
            last_gene = None
            c = E.Counter()
            with open(tmpfile, "r") as inf:
                for line in inf:
                    c.input += 1
                    # strip only the newline so that a final line without
                    # a terminator does not lose its last digit
                    data = line.rstrip("\n").split("\t")
                    # pasted columns alternate: id, count, id, count, ...
                    genes = list(set(data[0::2]))
                    values = [int(x) for x in data[1::2]]

                    assert len(genes) == 1, \
                        "paste command failed, wrong number of genes per line: '%s'" % line
                    if genes[0] == last_gene:
                        c.duplicates += 1
                        continue
                    c.output += 1
                    outf.write("%s\t%s\n" % (
                        genes[0], "\t".join(map(str, values))))
                    last_gene = genes[0]
        finally:
            outf.close()
    finally:
        # always remove the temporary file, also when a step above failed
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)

    E.info("aggregateWindowsTagCounts: %s" % c)