def test_tabs2genetrack(self):
    """
    Smoke test for the tabs2genetrack transformation on a short BED sample.

    The test makes no assertions on the produced file; it passes as long
    as the transformation completes without raising.
    """
    from genetrack.scripts import tabs2genetrack

    # input comes from the test data area, output goes to the temp area
    source_path = conf.testdata('short-data.bed', verify=True)
    target_path = conf.tempdata('short-data.genetrack')

    tabs2genetrack.transform(source_path, target_path, format='BED')
def transform(inpname, outname, format, shift=0, index=False, options=None):
    """
    Transforms reads stored in a BED or GFF file into a flat, tab delimited
    intermediate file with one line per read.

    Each output line has the form::

        chrom <TAB> index (zero padded to 12 digits) <TAB> fwd <TAB> rev <TAB> val

    where the index is the 5' end of the read: ``start + shift`` on the
    forward strand, ``end - shift`` on the reverse strand and the interval
    center when the strand is neither '+' nor '-'.

    Requires at least 6 columns for BED (strand is column 6) and at least
    7 columns for GFF (strand is column 7).

    Parameters:

    - inpname: path of the input file
    - outname: path of the output file; in this block only its basename is
      used, to name the intermediate flat file
    - format: either "BED" or "GFF"
    - shift: offset applied to the 5' end of stranded reads
    - index, options: accepted for interface compatibility, not used by
      the code in this block

    Raises Exception on an unknown format, on a format/extension mismatch
    and on data lines that have too few columns.
    """
    # local import keeps the block self contained
    from itertools import dropwhile

    # detect file formats
    if format == "BED":
        CHROM, START, END, STRAND = 0, 1, 2, 5
    elif format == "GFF":
        CHROM, START, END, STRAND = 0, 3, 4, 6
    else:
        # the message needs a placeholder, otherwise the % operator itself
        # fails with a TypeError and hides the real problem
        raise Exception('Invalid file format %s' % (format,))

    # two sanity checks, one day someone will thank me
    if format == 'BED' and inpname.endswith('gff'):
        raise Exception('BED format on a gff file?')
    if format == 'GFF' and inpname.endswith('bed'):
        raise Exception('GFF format on a bed file?')

    # the intermediate flat file is named after the output file
    basename = os.path.basename(outname)
    flat = conf.tempdata('%s.flat' % basename)

    logger.debug("parsing '%s'" % inpname)
    logger.debug("output to '%s'" % outname)
    logger.debug("unsorted flat file '%s'" % flat)

    def is_comment(row):
        "True for non-empty rows whose first field starts with a hash mark"
        return bool(row) and row[0].startswith('#')

    # 'U' requests universal newlines
    # NOTE(review): the 'U' flag is Python 2 era and was removed in
    # Python 3.11, revisit if this module gets ported
    with open(inpname, 'rU') as source:

        # check for track information on first line,
        # much faster this way than conditional checking on each line
        first = source.readline()
        source.seek(0)

        reader = csv.reader(source, delimiter='\t')

        # skip if trackline exists
        if first.startswith('track'):
            next(reader, None)

        # unwind the leading comments; dropwhile (unlike takewhile) hands
        # the first non-comment row back instead of swallowing it
        rows = dropwhile(is_comment, reader)

        # create the unsorted output file and apply corrections
        with open(flat, 'wt') as fp:
            for row in rows:
                try:
                    chrom, start, end, strand = row[CHROM], row[START], row[END], row[STRAND]
                except IndexError:
                    # too few columns: may be a blank line, a comment, a
                    # header line or the end of the annotation section
                    if not row:
                        continue
                    lead = row[0]
                    if lead.startswith('>'):
                        # hit the sequence content of the gff file
                        break
                    if lead.startswith(('#', 'track', 'browser')):
                        # hit upon some comments or header lines
                        continue
                    logger.error(row)
                    raise Exception(
                        'too few columns for %s format at line %s: %r'
                        % (format, reader.line_num, row)
                    )

                if strand == '+':
                    # on forward strand, 5' is at start
                    idx = int(start) + shift
                    fwd, rev, val = 1, 0, 1
                elif strand == '-':
                    # on reverse strand, 5' is at end
                    idx = int(end) - shift
                    fwd, rev, val = 0, 1, 1
                else:
                    # no strand specified, generate interval centers;
                    # floor division keeps the index an integer
                    idx = (int(start) + int(end)) // 2
                    fwd, rev, val = 0, 0, 1

                # it is essential be able to sort the index as a string!
                fp.write('%s\t%012d\t%s\t%s\t%s\n' % (chrom, idx, fwd, rev, val))
def bedfile(inpfile):
    """
    Runs the bed2genetrack transformation on *inpfile*.

    The output path is obtained from conf.tempdata and is named after the
    input file's basename with a '.genetrack' suffix appended.
    Nothing is returned.
    """
    target = conf.tempdata('%s.genetrack' % os.path.basename(inpfile))
    bed2genetrack.transform(inpfile, target)