def __init__(self, nib_fns=None, nib_dirs=None):
    '''Build a NibDB from explicit .nib files and/or directories of them.

    *nib_fns* is a path, or an iterable of paths, to specific .nib files
    desired for the NibDB.  *nib_dirs* is a path, or an iterable of paths,
    to directories; every ``*.nib`` file directly inside each directory is
    added to the NibDB.  Either argument may be omitted (or ``None``), which
    means "no files" / "no directories".

    Sequences are keyed by the base name reported by ``get_file_parts``.
    Explicitly passed files take precedence, without warning, over those
    found in directories when sequence names collide.
    '''
    SeqDB.__init__(self)

    # None replaces the old shared mutable list defaults; a bare string
    # means the caller supplied a single path rather than a list of them
    if nib_dirs is None:
        nib_dirs = []
    elif isinstance(nib_dirs, str):
        nib_dirs = [nib_dirs]
    if nib_fns is None:
        nib_fns = []
    elif isinstance(nib_fns, str):
        nib_fns = [nib_fns]

    # directory hits first, explicit files last, so that explicit files
    # overwrite directory hits with the same base name below
    candidates = []
    for d in nib_dirs:
        candidates.extend(glob.glob(os.path.join(d, '*.nib')))
    candidates.extend(nib_fns)

    # settle name collisions *before* opening anything: previously the
    # losing file was opened and then dropped from the map without ever
    # being closed
    chosen = {}
    for fn in candidates:
        nib_base = get_file_parts(fn)[2]
        chosen[nib_base] = fn

    for nib_base, fn in chosen.items():
        # open the nib file and register its handle under the sequence name
        fn, nib_f = _nib_fd(fn)
        self._db_map[nib_base] = nib_f

        # store some info
        self.db_info[nib_base]['path'] = fn
        self.db_info[nib_base]['nbases'] = validate_nib_file(nib_f)
def __init__(self, nib_fns=None, nib_dirs=None):
    '''Build a NibDB from explicit .nib files and/or directories of them.

    *nib_fns* is a path, or an iterable of paths, to specific .nib files
    desired for the NibDB.  *nib_dirs* is a path, or an iterable of paths,
    to directories; every ``*.nib`` file directly inside each directory is
    added to the NibDB.  Either argument may be omitted (or ``None``), which
    means "no files" / "no directories".

    Sequences are keyed by the base name reported by ``get_file_parts``.
    Explicitly passed files take precedence, without warning, over those
    found in directories when sequence names collide.
    '''
    SeqDB.__init__(self)

    # None replaces the old shared mutable list defaults; a bare string
    # means the caller supplied a single path rather than a list of them
    if nib_dirs is None:
        nib_dirs = []
    elif isinstance(nib_dirs, str):
        nib_dirs = [nib_dirs]
    if nib_fns is None:
        nib_fns = []
    elif isinstance(nib_fns, str):
        nib_fns = [nib_fns]

    # directory hits first, explicit files last, so that explicit files
    # overwrite directory hits with the same base name below
    candidates = []
    for d in nib_dirs:
        candidates.extend(glob.glob(os.path.join(d, '*.nib')))
    candidates.extend(nib_fns)

    # settle name collisions *before* opening anything: previously the
    # losing file was opened and then dropped from the map without ever
    # being closed
    chosen = {}
    for fn in candidates:
        nib_base = get_file_parts(fn)[2]
        chosen[nib_base] = fn

    for nib_base, fn in chosen.items():
        # open the nib file and register its handle under the sequence name
        fn, nib_f = _nib_fd(fn)
        self._db_map[nib_base] = nib_f

        # store some info
        self.db_info[nib_base]['path'] = fn
        self.db_info[nib_base]['nbases'] = validate_nib_file(nib_f)
def get_nib_header_batch(nib, queries):
    '''Batch method for creating nibFrag headers.

    *nib* identifies the .nib file and is resolved through ``_nib_fd``.
    *queries* is a list of at most 6-tuples
    (start,end,strand,name,dbHeader,tbaHeader) representing queries as
    specified by the original nibFrag utility.  Only start, end, and strand
    fields are required; omitted trailing fields are treated as ``None``.
    An *end* of -1 stands for the number of bases in the file.

    Returns a list with one header string per query, each starting with
    ``>`` and ending with a newline.
    '''
    nib_path, nib_f = _nib_fd(nib)
    try:
        # validate through the handle we already hold instead of handing the
        # raw argument over to be resolved a second time
        nbases = validate_nib_file(nib_f)
    finally:
        # a handle that is not the caller's own object was opened on our
        # behalf, so release it here (assumes _nib_fd returns a file object)
        if nib_f is not nib:
            nib_f.close()
    nib_base = get_file_parts(nib_path)[2]

    headers = []
    for rec in queries:
        # pad short records with None so they always unpack into six fields
        rec = list(rec)
        rec.extend([None] * (6 - len(rec)))
        start, end, strand, name, dbHeader, tbaHeader = rec

        if end == -1:
            end = nbases

        # default label is "<path>:<start>-<end>" unless a name was supplied
        label = name if name else nib_path + ':%d-%d' % (start, end)
        db = ''
        if tbaHeader:
            # nibFrag drops the name when tbaHeader is supplied and
            # dbHeader is not
            if not dbHeader:
                label = ''
            db = '%s.%s:%d-%d of %d' % (tbaHeader, nib_base, start, end, nbases)
        if dbHeader:
            # dbHeader wins over tbaHeader when both are given
            db = ':%s.%s:%d-%d:%s:%d' % (dbHeader, nib_base, start, end,
                                         strand, nbases)

        headers.append('>%s%s\n' % (label, db))

    return headers
def get_nib_header_batch(nib, queries):
    '''Batch method for creating nibFrag headers.

    *nib* identifies the .nib file and is resolved through ``_nib_fd``.
    *queries* is a list of at most 6-tuples
    (start,end,strand,name,dbHeader,tbaHeader) representing queries as
    specified by the original nibFrag utility.  Only start, end, and strand
    fields are required; omitted trailing fields are treated as ``None``.
    An *end* of -1 stands for the number of bases in the file.

    Returns a list with one header string per query, each starting with
    ``>`` and ending with a newline.
    '''
    nib_path, nib_f = _nib_fd(nib)
    try:
        # validate through the handle we already hold instead of handing the
        # raw argument over to be resolved a second time
        nbases = validate_nib_file(nib_f)
    finally:
        # a handle that is not the caller's own object was opened on our
        # behalf, so release it here (assumes _nib_fd returns a file object)
        if nib_f is not nib:
            nib_f.close()
    nib_base = get_file_parts(nib_path)[2]

    headers = []
    for rec in queries:
        # pad short records with None so they always unpack into six fields
        rec = list(rec)
        rec.extend([None] * (6 - len(rec)))
        start, end, strand, name, dbHeader, tbaHeader = rec

        if end == -1:
            end = nbases

        # default label is "<path>:<start>-<end>" unless a name was supplied
        label = name if name else nib_path + ':%d-%d' % (start, end)
        db = ''
        if tbaHeader:
            # nibFrag drops the name when tbaHeader is supplied and
            # dbHeader is not
            if not dbHeader:
                label = ''
            db = '%s.%s:%d-%d of %d' % (tbaHeader, nib_base, start, end, nbases)
        if dbHeader:
            # dbHeader wins over tbaHeader when both are given
            db = ':%s.%s:%d-%d:%s:%d' % (dbHeader, nib_base, start, end,
                                         strand, nbases)

        headers.append('>%s%s\n' % (label, db))

    return headers
# parse command line arguments opts, args = parser.parse_args(sys.argv[1:]) if len(args) < 3 : parser.error('Must provide two non-option arguments') # filenames and paths organism, experiment_fn, control_fn = args[0:3] control_fn = None if len(args) > 3 : control_fn = args[2] org_settings = get_org_settings(organism) refseq_fn = org_settings['annotation_path'] exp_fpath,exp_fname,exp_fbase,exp_fext = get_file_parts(experiment_fn) exp_wrk_dir = os.path.abspath('.exp_%s_%s'%(exp_fbase,opts.exp_name)) if control_fn : cnt_fpath,cnt_fname,cnt_fbase,cnt_fext = get_file_parts(control_fn) cnt_wrk_dir = os.path.abspath('.cnt_%s_%s'%(cnt_fbase,opts.exp_name)) # the pipeline pipeline = Pypeline() steps = [] # split up files calls = ["mkdir %s"%exp_wrk_dir, "split_file.py %s --outdir=%s %s"%(opts.split_args,exp_wrk_dir,experiment_fn),] if control_fn :
if __name__ == '__main__' : opts,args = parser.parse_args(sys.argv[1:]) if len(args) == 0 : parser.print_usage() sys.exit(1) gerald_fns = args # step through the files for gerald_fn in gerald_fns : path,fn,fnbase,fnext = get_file_parts(gerald_fn) bed_lines = [] # where to write output to if opts.stdout : f_out = sys.stdout else : f_out = open(os.path.join(path,fnbase+'.bed'),'w') # process input gerald_d = DictReader(open(gerald_fn),fieldnames=GERALDOutput.FIELD_NAMES,delimiter='\t') for line_d in gerald_d : if (opts.pass_only and line_d['filtering'] == 'Y' and line_d['match_pos'] != '') or (not opts.pass_only and line_d['match_pos'] != '') : if opts.chromo_strip is not None :
if __name__ == '__main__' : opts, args = parser.parse_args(sys.argv[1:]) utility, filenames = args[0], args[1:] # try to find the utility abs_utility = os.path.abspath(utility) if not os.path.exists(abs_utility) : # look on the path abs_utility = Popen('which %s'%utility,shell=True,stdout=PIPE,stderr=PIPE).communicate()[0].strip() if not os.path.exists(abs_utility) : raise Exception("Utility %s could not be found in the local directory or on the user's path, exiting"%utility) sys.exit(1) upath,uname,ubase,uext = get_file_parts(abs_utility) runscript_tmpl = """ #!/bin/bash #$ -N %(jobname)s #$ -S /bin/sh #$ -o %(stdout)s #$ -e %(stderr)s #$ -cwd export PYTHONPATH=%(pythonpath)s:${PYTHONPATH} %(utility)s %(utilargs)s %(filename)s""" suffix = ubase if opts.suffix is None else opts.suffix for fn in filenames :
nibFrag_grp.add_option('--dbHeader',dest='dbHeader',default=None,help='Add full database info to the header, with or without -name option') nibFrag_grp.add_option('--tbaHeader',dest='tbaHeader',default=None,help='Format header for compatibility with tba, takes database name as argument') parser.add_option_group(nibFrag_grp) if __name__ == '__main__' : opts, args = parser.parse_args(sys.argv[1:]) if len(args) < 1 : parser.print_usage() parser.exit(1) # setup nib_path = args[0] nib_dir,nib_fn,nib_base,nib_ext = get_file_parts(nib_path) queries = [] if opts.batch : if len(args) < 2 : parser.error('Two arguments must be supplied in batch mode') batch_fns = args[1:] for fn in batch_fns : if opts.batch_format == 'BED' : for bed in BEDFile(fn) : if bed['chrom'] != nib_base : warnings.warn('Chromosome in BED line %s does not match file %s, skipping'%(bed['chrom'],nib_base)) else :
filter_str = filter_str.replace('>=','_GTE_') filter_str = filter_str.replace('<=','_LTE_') filter_str = filter_str.replace('>','_GT_') filter_str = filter_str.replace('<','_LT_') fn_str += '_%s'%filter_str if opts.top is not None : fn_str += '_top%d'%opts.top if len(opts.sort_by) != 0 : fn_str += '_sortby_%s'%opts.sort_by if opts.shuffle : fn_str += '_shuffled' macs_path,macs_fn,macs_basefn,macs_ext = get_file_parts(args[0]) encoded_fn = os.path.join(macs_path,macs_basefn+fn_str+macs_ext) if opts.print_encoded_fn : sys.stdout.write(encoded_fn) sys.exit(0) else : out_f = open(encoded_fn,'w') elif opts.output : out_f = open(opts.output,'w') else : out_f = sys.stdout # parse the filters field_filters = defaultdict(list) for filter in opts.filters : field, filter_cond = parse_filter(filter)
from optparse import OptionParser from chipsequtil import KnownGeneFile, get_file_parts #args = ['/nfs/genomes/mouse_gp_jul_07/anno/knownGene-2010-07-08.txt','/nfs/genomes/mouse_gp_jul_07/anno/kgXref-2010-07-08.txt'] args = ['/nfs/genomes/mouse_gp_jul_07/anno/knownGene-2010-08-03.gtf','/nfs/genomes/mouse_gp_jul_07/anno/kgXref-2010-07-08.txt'] usage = '%prog <knownGene annotation>' description = 'convert a UCSC knownGene annotation to GFF' parser = OptionParser(usage=usage,description=description) if __name__ == '__main__' : opts, args = parser.parse_args(args) kg_path,kg_fn,kg_base,kg_ext = get_file_parts(args[0]) #kg_f = KnownGeneFile(args[0]) # xref for finding gene symbols kgXref_fn = args[1] kgXref_fieldnames = ['kgID','mRNA','spID','spDisplayID','geneSymbol','refseq','proAcc','description'] xref_map = dict([(x['kgID'],x) for x in DictReader(open(kgXref_fn),delimiter='\t',fieldnames=kgXref_fieldnames)]) gff_headers = ['seqname','source','feature','start','end','score','strand','frame','attributes'] gff_reader = DictReader(open(args[0]),delimiter='\t',fieldnames=gff_headers) gff_writer = DictWriter(sys.stdout,delimiter='\t',fieldnames=gff_headers,quotechar='',quoting=QUOTE_NONE,lineterminator='\n') #gff_writer.writerow(dict([(x,x) for x in gff_headers])) for i,rec in enumerate(gff_reader) : #d = {} #d['seqname'] = rec['chrom']
else : other_args.append(arg) opts, args = parser.parse_args(wqsub_args) if len(other_args) == 0 : parser.error('Must provide a command') command = ' '.join(other_args) runscript_tmpl = templates[opts.drm] # set up job parameters cmd_exe = os.path.basename(other_args[0]) jobname = opts.wqsub_name+'_'+cmd_exe stdout_fn = jobname+opts.wqsub_ext stdout = os.path.abspath(stdout_fn) fpath,fname,fbase,fext = get_file_parts(stdout) stderr = os.path.abspath(os.path.join(jobname+'.err')) # get the user's current environment and put it into the execute script if opts.wqsub_no_env : env_str = '# local environment variables omitted' else : env_str = '#%s -V'%drm_symb[opts.drm] # construct the script addnl_params = [] for addnl in opts.drm_args : addnl_params.append('#%s %s'%(drm_symb[opts.drm],addnl)) addnl_params = '\n'.join(addnl_params) job_dict = {'jobname':fname,