def check_options(opts):
    """Validate and normalize modeling options.

    Ensures the working directory exists, records TADbit/dependency
    versions, converts genomic begin/end coordinates into bin numbers,
    expands the optimization parameter ranges ("beg:end:step" strings or
    value lists) into explicit tuples, and prepares a uniquely named
    temporary database path for Lustre-like file systems.
    """
    # check resume
    if not path.exists(opts.workdir):
        warn('ERROR: workdir not found, creating it')
        mkdir(opts.workdir)
    # write version log
    vlog_path = path.join(opts.workdir,
                          'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    # BUG FIX: readlines() returns a list which can never equal the
    # dependencies string, forcing a rewrite on every run; compare the
    # whole file content instead.
    if not path.exists(vlog_path) or open(vlog_path).read() != dependencies:
        print('Writing versions of TADbit and dependencies')
        with open(vlog_path, 'w') as vlog:
            vlog.write(dependencies)

    # do the division to bins
    if opts.job_list is not None:
        if opts.job_list == []:
            opts.job_list = ['maxdist', 'upfreq', 'lowfreq', 'scale',
                             'dcutoff']
    try:
        # keep genomic coordinates around before converting to bins
        opts.ori_beg = opts.beg
        opts.ori_end = opts.end
        opts.beg = int(float(opts.beg) / opts.reso)
        opts.end = int(float(opts.end) / opts.reso)
        if opts.end - opts.beg <= 2:
            raise Exception('"beg" and "end" parameter should be given in '
                            'genomic coordinates, not bin')
    except TypeError:  # beg/end not provided
        pass

    # turn options into lists
    def _load_range(range_str, num=float):
        # accepts either a single "beg:end:step" string or a list of values
        try:
            beg, end, step = map(num, range_str[0].split(':'))
            return tuple(arange(beg, end + step, step))
        except (AttributeError, ValueError):
            return tuple([num(v) for v in range_str])

    opts.scale = _load_range(opts.scale)
    opts.maxdist = _load_range(opts.maxdist, num=int)
    opts.upfreq = _load_range(opts.upfreq)
    opts.lowfreq = _load_range(opts.lowfreq)
    opts.dcutoff = _load_range(opts.dcutoff)
    opts.nmodels_run = opts.nmodels_run or opts.nmodels
    if opts.matrix:
        opts.matrix = path.abspath(opts.matrix)
    opts.workdir = path.abspath(opts.workdir)
    mkdir(opts.workdir)

    # for LUSTRE-like file systems: work on a randomly named DB copy
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
def check_options(opts):
    """Validate parsing options, configure logging in the working
    directory, record dependency versions and abort if the exact same
    job was already computed."""
    if not opts.workdir:
        raise Exception('ERROR: output option required.')
    if opts.type != 'map':
        raise NotImplementedError('ERROR: not yet there')
    if not opts.genome:
        raise Exception('ERROR: genome parameter required.')
    if not opts.workdir:
        raise Exception('ERROR: workdir parameter required.')

    # check skip
    if not path.exists(opts.workdir) and opts.skip:
        print('WARNING: can use output files, found, not skipping...')
        opts.skip = False
    if opts.workdir.endswith('/'):
        opts.workdir = opts.workdir[:-1]

    # write log
    log_format = '[PARSING]   %(message)s'
    # reset logging
    logging.getLogger().handlers = []
    try:
        # BUG FIX: py2 print statement, and 'aw' is not a valid file
        # mode ('a' already appends)
        print('Writing log to ' + path.join(opts.workdir, 'process.log'))
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='a')
    except IOError:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a')
    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log
    vlog_path = path.join(opts.workdir,
                          'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    # BUG FIX: compare file content (read()) with the dependencies
    # string; readlines() gives a list and the test was always True
    if not path.exists(vlog_path) or open(vlog_path).read() != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        with open(vlog_path, 'w') as vlog:
            vlog.write(dependencies)

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        exit('WARNING: exact same job already computed, see JOBs table above')
def check_options(opts):
    """Prepare the working directory, version log, trace database
    (optionally copied to a temporary location for Lustre-like file
    systems) and the number of CPUs to use."""
    if not path.exists(opts.workdir):
        mkdir(opts.workdir)

    # write version log
    vlog_path = path.join(opts.workdir,
                          'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    # BUG FIX: readlines() returns a list, which can never equal the
    # dependencies string; compare the whole file content instead
    if not path.exists(vlog_path) or open(vlog_path).read() != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        with open(vlog_path, 'w') as vlog:
            vlog.write(dependencies)

    mkdir(path.join(opts.workdir, '03_filtered_reads'))

    # create empty DB if don't exists
    dbpath = path.join(opts.workdir, 'trace.db')
    open(dbpath, 'a').close()

    # for LUSTRE file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            # no DB yet: it will be created at the temporary location
            pass

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())
def check_options(opts):
    """Validate parsing options (py3 variant): required parameters,
    working directory, logging, version log, temporary database for
    Lustre-like file systems, and duplicate-job detection."""
    if not opts.workdir:
        raise Exception('ERROR: output option required.')
    if opts.type != 'map':
        raise NotImplementedError('ERROR: not yet there')
    if not opts.genome:
        raise Exception('ERROR: genome parameter required.')
    if not opts.workdir:
        raise Exception('ERROR: workdir parameter required.')

    # check skip
    if not path.exists(opts.workdir) and opts.skip:
        print('WARNING: can use output files, found, not skipping...')
        opts.skip = False
    if opts.workdir.endswith('/'):
        opts.workdir = opts.workdir[:-1]

    # write log
    newbie = False
    if not path.exists(opts.workdir):
        newbie = True
        mkdir(opts.workdir)
    log_format = '[PARSING]   %(message)s'
    # reset logging
    logging.getLogger().handlers = []
    try:
        print('Writing log to ' + path.join(opts.workdir, 'process.log'))
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='a+')
    except IOError:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a+')
    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log
    vlog_path = path.join(opts.workdir,
                          'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    # BUG FIX: compare file content (read()) with the dependencies
    # string; readlines() gives a list and the test was always True
    if not path.exists(vlog_path) or open(vlog_path).read() != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        with open(vlog_path, 'w') as vlog:
            vlog.write(dependencies)

    # for lustre file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            # no DB yet: it will be created at the temporary location
            pass

    # check if job already run using md5 digestion of parameters
    try:
        if already_run(opts):
            if 'tmpdb' in opts and opts.tmpdb:
                remove(path.join(dbdir, dbfile))
            exit('WARNING: exact same job already computed, '
                 'see JOBs table above')
    except OSError:
        pass
def check_options(opts):
    """Validate mapping options.

    Resolves the mapper binary and GEM version, checks fast-fragment
    prerequisites, restriction enzyme names, input paths, CPU count,
    logging, mapper extra parameters and the trace database; exits if
    the exact same job was already computed.
    """
    if not opts.mapper_binary:
        if opts.mapper == 'gem':
            opts.mapper_binary = 'gem-mapper'
        else:
            opts.mapper_binary = opts.mapper
    opts.mapper_binary = which(opts.mapper_binary)
    if not opts.mapper_binary:
        raise Exception(
            '\n\nERROR: Mapper binary not found, for GEM install it from:'
            '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
            '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if'
            'have a recent computer, the '
            'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
            'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
            'Copy the binary gem-mapper to /usr/local/bin/ for '
            'example (somewhere in your PATH).\n\nNOTE: GEM does '
            'not provide any binary for MAC-OS.')

    opts.gem_version = 0
    if opts.mapper == 'gem':
        opts.gem_version = None
        try:
            out, _ = Popen([opts.mapper_binary, '--version'],
                           stdout=PIPE, stderr=STDOUT,
                           universal_newlines=True).communicate()
            # NOTE(review): takes the second character of the version
            # output as the major version -- confirm against GEM output
            opts.gem_version = int(out[1])
        except ValueError:
            opts.gem_version = 2
            print('Falling to gem v2')

    if opts.fast_fragment:
        if opts.gem_version < 3:
            raise Exception('ERROR: Fast fragment mapping needs GEM v3')
        if not opts.fastq2 or not path.exists(opts.fastq2):
            raise Exception(
                'ERROR: Fast fragment mapping needs both fastq files. '
                'Please specify --fastq2')
        if opts.read != 0:
            raise Exception(
                'ERROR: Fast fragment mapping needs to be specified with --read 0')
        if not opts.genome:
            raise Exception('ERROR: Fast fragment mapping needs '
                            'the genome parameter.')

    # check RE name
    if opts.renz == ['CHECK']:
        print('\nSearching for most probable restriction enzyme in file: %s'
              % (opts.fastq))
        try:
            pat, enz, pv = identify_re(opts.fastq, nreads=100000)
            print(' -> Most probable digested site: %s (pv: %f)' % (pat, pv))
            print(' -> Enzymes matching: %s' % (', '.join(enz)))
        except ValueError:
            print(' -> Nothing found...')
        exit()
    for n, renz in enumerate(opts.renz):
        if renz == 'NONE':
            opts.renz[n] = None
            continue
        try:
            _ = RESTRICTION_ENZYMES[renz]
        except KeyError:
            print('\n\nERROR: restriction enzyme %s not found.' % (renz) +
                  'Use one of:\n\n' + ' '.join(sorted(RESTRICTION_ENZYMES)) +
                  '\n\n')
            raise KeyError()
        except AttributeError:
            pass

    # check skip
    if not path.exists(opts.workdir) and opts.skip:
        print('WARNING: can use output files, found, not skipping...')
        opts.skip = False

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check paths
    if opts.mapper == 'gem' and not path.exists(opts.index):
        raise IOError('ERROR: index file not found at ' + opts.index)
    if not path.exists(opts.fastq):
        raise IOError('ERROR: FASTQ file not found at ' + opts.fastq)
    if not is_fastq(opts.fastq):
        raise IOError(('ERROR: FASTQ file %s wrong format, check')
                      % (opts.fastq))

    try:
        opts.windows = [[int(i) for i in win.split(':')]
                        for win in opts.windows]
    except TypeError:  # no windows given
        pass

    mkdir(opts.workdir)

    # write log
    log_format = '[MAPPING {} READ{}]   %(message)s'.format(opts.fastq,
                                                            opts.read)
    # reset logging
    logging.getLogger().handlers = []
    try:
        print('Writing log to ' + path.join(opts.workdir, 'process.log'))
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='a+')
    except IOError:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a+')
    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log
    vlog_path = path.join(opts.workdir,
                          'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    # BUG FIX: compare file content (read()) with the dependencies
    # string; readlines() gives a list and the test was always True
    if not path.exists(vlog_path) or open(vlog_path).read() != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        with open(vlog_path, 'w') as vlog:
            vlog.write(dependencies)

    # check mapper extra options
    if opts.mapper_param:
        if (len(opts.mapper_param) == 1 and
                ('-' in opts.mapper_param[0] or '--' in opts.mapper_param[0])):
            # Single string surrounded by quotes
            opts.mapper_param = opts.mapper_param[0].split()
        else:
            opts.mapper_param = dict([o.split(':')
                                      for o in opts.mapper_param])
    else:
        opts.mapper_param = {}
    if opts.mapper == 'gem' and opts.gem_version < 3:
        gem_valid_option = set([
            "granularity", "q", "quality-format", "gem-quality-threshold",
            "mismatch-alphabet", "m", "e", "min-matched-bases",
            "max-big-indel-length", "s", "strata-after-best", "fast-mapping",
            "unique-mapping", "d", "D", "allow-incomplete-strata",
            "max-decoded-matches", "min-decoded-strata", "p",
            "paired-end-alignment", "b", "map-both-ends", "min-insert-size",
            "max-insert-size", "E", "max-extendable-matches",
            "max-extensions-per-match", "unique-pairing"])
        for k in opts.mapper_param:
            if k not in gem_valid_option:
                raise NotImplementedError(
                    ('ERROR: option "%s" not a valid GEM option'
                     'or not suported by this tool.') % k)

    # create empty DB if don't exists
    dbpath = path.join(opts.workdir, 'trace.db')
    open(dbpath, 'a').close()

    # for lustre file system....
    if 'tmpdb' in opts and opts.tmpdb:
        dbdir = opts.tmpdb
        # tmp file
        dbfile = 'trace_%s' % (''.join(
            [ascii_letters[int(random() * 52)] for _ in range(10)]))
        opts.tmpdb = path.join(dbdir, dbfile)
        try:
            copyfile(path.join(opts.workdir, 'trace.db'), opts.tmpdb)
        except IOError:
            pass

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        if 'tmpdb' in opts and opts.tmpdb:
            remove(path.join(dbdir, dbfile))
        exit('WARNING: exact same job already computed, see JOBs table above')
def check_options(opts):
    """Validate iterative-mapping options: GEM binary, restriction
    enzyme, input paths, CPU count, temporary directory, logging, GEM
    extra parameters; exits if the job was already computed."""
    if opts.cfg:
        get_options_from_cfg(opts.cfg, opts)

    opts.gem_binary = which(opts.gem_binary)
    if not opts.gem_binary:
        raise Exception(
            '\n\nERROR: GEM binary not found, install it from:'
            '\nhttps://sourceforge.net/projects/gemlibrary/files/gem-library/Binary%20pre-release%202/'
            '\n - Download the GEM-binaries-Linux-x86_64-core_i3 if'
            'have a recent computer, the '
            'GEM-binaries-Linux-x86_64-core_2 otherwise\n - '
            'Uncompress with "tar xjvf GEM-binaries-xxx.tbz2"\n - '
            'Copy the binary gem-mapper to /usr/local/bin/ for '
            'example (somewhere in your PATH).\n\nNOTE: GEM does '
            'not provide any binary for MAC-OS.')

    # check RE name
    try:
        _ = RESTRICTION_ENZYMES[opts.renz]
    except KeyError:
        print('\n\nERROR: restriction enzyme not found. Use one of:\n\n' +
              ' '.join(sorted(RESTRICTION_ENZYMES)) + '\n\n')
        raise KeyError()
    except AttributeError:
        pass

    # check skip
    if not path.exists(opts.workdir) and opts.skip:
        print('WARNING: can use output files, found, not skipping...')
        opts.skip = False

    # number of cpus
    if opts.cpus == 0:
        opts.cpus = cpu_count()
    else:
        opts.cpus = min(opts.cpus, cpu_count())

    # check paths
    if not path.exists(opts.index):
        raise IOError('ERROR: index file not found at ' + opts.index)
    if not path.exists(opts.fastq):
        raise IOError('ERROR: FASTQ file not found at ' + opts.fastq)

    # create tmp directory
    if not opts.tmp:
        opts.tmp = opts.workdir + '_tmp_r%d' % opts.read

    try:
        opts.windows = [[int(i) for i in win.split(':')]
                        for win in opts.windows]
    except TypeError:  # no windows given
        pass

    mkdir(opts.workdir)

    # write log
    log_format = '[MAPPING {} READ{}]   %(message)s'.format(opts.fastq,
                                                            opts.read)
    # reset logging
    logging.getLogger().handlers = []
    try:
        # BUG FIX: py2 print statement, and 'aw' is not a valid file
        # mode ('a' already appends)
        print('Writing log to ' + path.join(opts.workdir, 'process.log'))
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log'),
                            filemode='a')
    except IOError:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.workdir, 'process.log2'),
                            filemode='a')
    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log
    vlog_path = path.join(opts.workdir,
                          'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    # BUG FIX: compare file content (read()) with the dependencies
    # string; readlines() gives a list and the test was always True
    if not path.exists(vlog_path) or open(vlog_path).read() != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        with open(vlog_path, 'w') as vlog:
            vlog.write(dependencies)

    # check GEM mapper extra options
    if opts.gem_param:
        opts.gem_param = dict([o.split(':') for o in opts.gem_param])
    else:
        opts.gem_param = {}
    gem_valid_option = set([
        "granularity", "q", "quality-format", "gem-quality-threshold",
        "mismatch-alphabet", "m", "e", "min-matched-bases",
        "max-big-indel-length", "s", "strata-after-best", "fast-mapping",
        "unique-mapping", "d", "D", "allow-incomplete-strata",
        "max-decoded-matches", "min-decoded-strata", "p",
        "paired-end-alignment", "b", "map-both-ends", "min-insert-size",
        "max-insert-size", "E", "max-extendable-matches",
        "max-extensions-per-match", "unique-pairing"])
    for k in opts.gem_param:
        if k not in gem_valid_option:
            raise NotImplementedError(
                ('ERROR: option "%s" not a valid GEM option'
                 'or not suported by this tool.') % k)

    # check if job already run using md5 digestion of parameters
    if already_run(opts):
        exit('WARNING: exact same job already computed, see JOBs table above')
def get_options():
    """
    Parse modeling options from the command line, optionally merged with
    a configuration file; validate them, create the output directory,
    configure logging and return the options namespace.
    """
    parser = ArgumentParser(
        usage="%(prog)s [options] [--cfg CONFIG_PATH]",
        formatter_class=lambda prog: HelpFormatter(prog, width=95,
                                                   max_help_position=27))
    glopts = parser.add_argument_group('General arguments')
    taddet = parser.add_argument_group('TAD detection arguments')
    optimo = parser.add_argument_group('Optimization of IMP arguments')
    modelo = parser.add_argument_group('Modeling with optimal IMP arguments')
    descro = parser.add_argument_group('Descriptive, optional arguments')
    analyz = parser.add_argument_group('Output arguments')

    ## Define analysis actions:
    actions = {0: "do nothing",
               1: "column filtering",
               2: "TAD borders",
               3: "TAD alignment",
               4: "optimization plot",
               5: "correlation real/models",
               6: "z-score plot",
               7: "constraints",
               8: "objective function",
               9: "centroid",
               10: "consistency",
               11: "density",
               12: "contact map",
               13: "walking angle",
               14: "persistence length",
               15: "accessibility",
               16: "interaction"}

    parser.add_argument('--usage', dest='usage', action="store_true",
                        default=False,
                        help='''show detailed usage documentation, with examples and exit''')
    parser.add_argument('--cfg', dest='cfg', metavar="PATH", action='store',
                        default=None, type=str,
                        help='path to a configuration file with predefined ' +
                        'parameters')
    parser.add_argument('--analyze_only', dest='analyze_only',
                        action='store_true', default=False,
                        help=('load precomputed models in outdir, ' +
                              'skip optimization, modeling'))
    parser.add_argument('--optimize_only', dest='optimize_only',
                        default=False, action='store_true',
                        help='do the optimization of the region and exit')
    parser.add_argument('--tad_only', dest='tad_only', action="store_true",
                        default=False,
                        help='[%(default)s] exit after searching for TADs')
    parser.add_argument('--ncpus', dest='ncpus', metavar="INT", default=1,
                        type=int, help='[%(default)s] Number of CPUs to use')

    #########################################
    # GENERAL
    glopts.add_argument('--root_path', dest='root_path', metavar="PATH",
                        default='', type=str,
                        help=('path to search for data files (just pass file name' +
                              'in "data")'))
    glopts.add_argument('--data', dest='data', metavar="PATH", nargs='+',
                        type=str,
                        help='''path to file(s) with Hi-C data matrix. If many, experiments will be summed up. I.e.: --data replicate_1.txt replicate_2.txt''')
    glopts.add_argument('--xname', dest='xname', metavar="STR", nargs='+',
                        default=[], type=str,
                        help='''[file name] experiment name(s). Use same order as data.''')
    glopts.add_argument('--norm', dest='norm', metavar="PATH", nargs='+',
                        type=str,
                        help='path to file(s) with normalizedHi-C data matrix.')
    glopts.add_argument('--nodiag', dest='nodiag', action='store_true',
                        help='''If the matrix does not contain self interacting bins (only zeroes in the diagonal)''')
    glopts.add_argument('--filt', dest='filt', metavar='INT', default=90,
                        help='''Filter out column with more than a given percentage of zeroes''')
    glopts.add_argument('--crm', dest='crm', metavar="NAME",
                        help='chromosome name')
    glopts.add_argument('--beg', dest='beg', metavar="INT", type=float,
                        default=None,
                        help='genomic coordinate from which to start modeling')
    glopts.add_argument('--end', dest='end', metavar="INT", type=float,
                        help='genomic coordinate where to end modeling')
    glopts.add_argument('--res', dest='res', metavar="INT", type=int,
                        help='resolution of the Hi-C experiment')
    glopts.add_argument('--outdir', dest='outdir', metavar="PATH",
                        default=None, help='out directory for results')

    #########################################
    # TADs
    taddet.add_argument('--tad', dest='tad', action="store_true",
                        default=False,
                        help='[%(default)s] search for TADs in experiments')
    taddet.add_argument('--centromere', dest='centromere', action="store_true",
                        default=False,
                        help='[%(default)s] search for centromeric region')
    taddet.add_argument('--group', dest='group', nargs='+', type=int,
                        default=0, metavar='INT',
                        help='''[all together] How to group Hi-C experiments for the detection of TAD borders. I.e.: "--exp_group 2 2 1" first 2 experiments used together, next 2 also, and last alone (batch_mode option used)''')

    #########################################
    # MODELING
    modelo.add_argument('--nmodels_mod', dest='nmodels_mod', metavar="INT",
                        default='5000', type=int,
                        help=('[%(default)s] number of models to generate for' +
                              ' modeling'))
    modelo.add_argument('--nkeep_mod', dest='nkeep_mod', metavar="INT",
                        default='1000', type=int,
                        help=('[%(default)s] number of models to keep for ' +
                              'modeling'))

    #########################################
    # OPTIMIZATION
    optimo.add_argument('--maxdist', action='store', metavar="LIST",
                        default='400', dest='maxdist',
                        help='range of numbers for maxdist' +
                        ', i.e. 400:1000:100 -- or just a number')
    optimo.add_argument('--upfreq', dest='upfreq', metavar="LIST",
                        default='0',
                        help='range of numbers for upfreq' +
                        ', i.e. 0:1.2:0.3 -- or just a number')
    optimo.add_argument('--lowfreq', dest='lowfreq', metavar="LIST",
                        default='0',
                        help='range of numbers for lowfreq' +
                        ', i.e. -1.2:0:0.3 -- or just a number')
    optimo.add_argument('--scale', dest='scale', metavar="LIST",
                        default="0.01",
                        help='[%(default)s] range of numbers to be test as ' +
                        'optimal scale value, i.e. 0.005:0.01:0.001 -- Can ' +
                        'also pass only one number')
    optimo.add_argument('--dcutoff', dest='dcutoff', metavar="LIST",
                        default="2",
                        help='[%(default)s] range of numbers to be test as ' +
                        'optimal distance cutoff parameter (distance, in ' +
                        'number of beads, from which to consider 2 beads as ' +
                        'being close), i.e. 1:5:0.5 -- Can also pass only one' +
                        ' number')
    optimo.add_argument('--nmodels_opt', dest='nmodels_opt', metavar="INT",
                        default='500', type=int,
                        help='[%(default)s] number of models to generate for ' +
                        'optimization')
    optimo.add_argument('--nkeep_opt', dest='nkeep_opt', metavar="INT",
                        default='100', type=int,
                        help='[%(default)s] number of models to keep for ' +
                        'optimization')
    optimo.add_argument('--force_opt', dest='optimize_from_scratch',
                        action="store_true", default=False,
                        help='''[%(default)s] do not take into account previous optimizations. Usefull for running in parallel in a cluster for example.''')

    #########################################
    # DESCRIPTION
    descro.add_argument('--species', dest='species', metavar="STRING",
                        default='UNKNOWN',
                        help='species name, with no spaces, i.e.: homo_sapiens')
    descro.add_argument('--cell', dest='cell', metavar="STRING",
                        help='cell type name')
    descro.add_argument('--exp_type', dest='exp_type', metavar="STRING",
                        help='experiment type name (i.e.: Hi-C)')
    descro.add_argument('--assembly', dest='assembly', metavar="STRING",
                        default=None,
                        help='''NCBI ID of the original assembly (i.e.: NCBI36 for human)''')
    descro.add_argument('--enzyme', dest='enzyme', metavar="STRING",
                        default=None,
                        help='''name of the enzyme used to digest chromatin (i.e. HindIII)''')
    descro.add_argument('--identifier', dest='identifier', metavar="STRING",
                        default=None,
                        help='''NCBI identifier of the experiment''')
    descro.add_argument('--project', dest='project', metavar="STRING",
                        default=None, help='''project name''')

    #########################################
    # OUTPUT
    # BUG FIX: default must be a list, not a range object -- later code
    # assigns into opts.analyze item by item
    analyz.add_argument('--analyze', dest='analyze', nargs='+',
                        choices=list(range(len(actions))), type=int,
                        default=list(range(2, len(actions))), metavar='INT',
                        help=('''[%s] list of numbers representing the analysis to be done. Choose between: %s''' % (
                            ' '.join([str(i) for i in range(2, len(actions))]),
                            '\n'.join(['%s) %s' % (k, actions[k])
                                       for k in actions]))))
    analyz.add_argument('--not_write_cmm', dest='not_write_cmm',
                        default=False, action='store_true',
                        help='''[%(default)s] do not generate cmm files for each model (Chimera input)''')
    analyz.add_argument('--not_write_xyz', dest='not_write_xyz',
                        default=False, action='store_true',
                        help='''[%(default)s] do not generate xyz files for each model (3D coordinates)''')

    parser.add_argument_group(optimo)
    parser.add_argument_group(modelo)
    parser.add_argument_group(descro)
    parser.add_argument_group(analyz)
    opts = parser.parse_args()

    if opts.usage:
        print(__doc__)  # py3 fix: print function
        exit()

    log = '\tSummary of arguments:\n'
    # merger opts with CFG file and write summary
    # (flat list of "key"/"value" words from the command line; replaces
    # the former reduce(lambda ...) flattening)
    args = [word for arg in sys.argv for word in arg.strip('-').split('=')]
    new_opts = {}
    if opts.cfg:
        for line in open(opts.cfg):
            if not '=' in line:
                continue
            if line.startswith('#'):
                continue
            key, value = line.split('#')[0].strip().split('=')
            key = key.strip()
            value = value.strip()
            if value == 'True':
                value = True
            elif value == 'False':
                value = False
            elif key in ['data', 'norm', 'xname', 'group', 'analyze']:
                new_opts.setdefault(key, []).extend(value.split())
                continue
            new_opts[key] = value
    # bad key in configuration file
    for bad_k in set(new_opts.keys()) - set(opts.__dict__.keys()):
        sys.stderr.write('WARNING: parameter "%s" not recognized' % (bad_k))
    for key in sorted(opts.__dict__.keys()):
        if key in args:
            log += '  * Command setting %13s to %s\n' % (
                key, opts.__dict__[key])
        elif key in new_opts:
            opts.__dict__[key] = new_opts[key]
            log += '  - Config. setting %13s to %s\n' % (
                key, new_opts[key])
        else:
            log += '  o Default setting %13s to %s\n' % (
                key, opts.__dict__[key])

    # rename analysis actions
    for i, j in enumerate(opts.analyze):
        opts.analyze[i] = actions[int(j)]

    if not opts.data and not opts.norm:
        sys.stderr.write('MISSING data')
        exit(parser.print_help())
    if not opts.outdir:
        sys.stderr.write('MISSING outdir')
        exit(parser.print_help())
    if not opts.crm:
        sys.stderr.write('MISSING crm NAME')
        exit(parser.print_help())
    if not opts.res:
        sys.stderr.write('MISSING resolution')
        exit(parser.print_help())
    if not opts.analyze_only:
        if not opts.maxdist:
            sys.stderr.write('MISSING maxdist')
            exit(parser.print_help())
        if not opts.lowfreq:
            sys.stderr.write('MISSING lowfreq')
            exit(parser.print_help())
        if not opts.upfreq:
            sys.stderr.write('MISSING upfreq')
            exit(parser.print_help())
    if not opts.beg and not opts.tad_only:
        sys.stderr.write('WARNING: no begin coordinate given all')
    if not opts.end and not opts.tad_only:
        # BUG FIX: message previously said "begin" for the missing end
        sys.stderr.write('WARNING: no end coordinate given all')

    # groups for TAD detection
    if not opts.data:
        opts.data = [None] * len(opts.norm)
    else:
        opts.norm = [None] * len(opts.data)
    if not opts.group:
        opts.group = [len(opts.data)]
    else:
        opts.group = [int(i) for i in opts.group]

    if sum(opts.group) > len(opts.data):
        logging.info('ERROR: Number of experiments in groups larger than ' +
                     'the number of Hi-C data files given.')
        exit()

    # this options should stay as this now
    # opts.scale = '0.01'

    # switch to number
    opts.nmodels_mod = int(opts.nmodels_mod)
    opts.nkeep_mod = int(opts.nkeep_mod)
    opts.nmodels_opt = int(opts.nmodels_opt)
    opts.nkeep_opt = int(opts.nkeep_opt)
    opts.ncpus = int(opts.ncpus)
    opts.res = int(opts.res)

    # TODO: UNDER TEST
    opts.container = None  # ['cylinder', 1000, 5000, 100]

    # do the division to bins
    if not opts.tad_only:
        try:
            opts.beg = int(float(opts.beg) / opts.res)
            opts.end = int(float(opts.end) / opts.res)
            if opts.end - opts.beg <= 2:
                raise Exception('"beg" and "end" parameter should be given in ' +
                                'genomic coordinates, not bin')
        except TypeError:  # beg/end not provided
            pass

    # Create out-directory
    name = '{0}_{1}_{2}'.format(opts.crm, opts.beg, opts.end)
    if not os.path.exists(os.path.join(opts.outdir, name)):
        os.makedirs(os.path.join(opts.outdir, name))

    # write version log
    if not os.path.exists(os.path.join(
            opts.outdir, 'TADbit_and_dependencies_versions.log')):
        vlog = os.path.join(opts.outdir,
                            'TADbit_and_dependencies_versions.log')
        vlog = open(vlog, 'w')
        vlog.write(get_dependencies_version())
        vlog.close()

    # write log
    if opts.optimize_only:
        log_format = '[OPTIMIZATION {}_{}_{}_{}_{}] %(message)s'.format(
            opts.maxdist, opts.upfreq, opts.lowfreq, opts.scale, opts.dcutoff)
    elif opts.analyze_only:
        log_format = '[ANALYZE] %(message)s'
    elif opts.tad_only:
        log_format = '[TAD] %(message)s'
    else:
        log_format = '[DEFAULT] %(message)s'
    try:
        logging.basicConfig(filename=os.path.join(opts.outdir, name,
                                                  name + '.log'),
                            level=logging.INFO, format=log_format)
    except IOError:
        logging.basicConfig(filename=os.path.join(opts.outdir, name,
                                                  name + '.log2'),
                            level=logging.INFO, format=log_format)
    logging.getLogger().addHandler(logging.StreamHandler())
    logging.info(('\n' + log_format.replace(' %(message)s', '')
                  ).join(log.split('\n')))

    # update path to Hi-C data adding root directory
    if opts.root_path and opts.data[0]:
        for i in range(len(opts.data)):  # py3 fix: xrange -> range
            logging.info(os.path.join(opts.root_path, opts.data[i]))
            opts.data[i] = os.path.join(opts.root_path, opts.data[i])
    # update path to Hi-C norm adding root directory
    if opts.root_path and opts.norm[0]:
        for i in range(len(opts.norm)):  # py3 fix: xrange -> range
            logging.info(os.path.join(opts.root_path, opts.norm[i]))
            opts.norm[i] = os.path.join(opts.root_path, opts.norm[i])
    return opts
def get_options():
    """
    Parse mapping-tool options from the command line (optionally merged
    with a configuration file), validate them, prepare the output and
    temporary directories, configure logging and return the namespace.
    """
    parser = ArgumentParser(usage="%(prog)s [options] [--cfg CONFIG_PATH]",
                            formatter_class=lambda prog: HelpFormatter(
                                prog, width=95, max_help_position=27))
    glopts = parser.add_argument_group('General options')
    mapper = parser.add_argument_group('Mapping options')
    descro = parser.add_argument_group('Descriptive, optional arguments')

    glopts.add_argument('--cfg', dest='cfg', metavar="PATH", action='store',
                        default=None, type=str,
                        help='path to a configuration file with predefined ' +
                        'parameters')
    glopts.add_argument('--qc_plot', dest='quality_plot', action='store_true',
                        default=False,
                        help='generate a quality plot of FASTQ and exits')
    glopts.add_argument('-o', '--output', dest='output', metavar="PATH",
                        action='store', default=None, type=str,
                        help='path to output folder')
    glopts.add_argument('--fastq', dest='fastq', metavar="PATH",
                        action='store', default=None, type=str,
                        help='path to a FASTQ files (can be compressed files)')
    glopts.add_argument('--genome', dest='genome', metavar="PATH", nargs='+',
                        type=str,
                        help='''paths to file(s) with FASTA files of the reference genome. If many, files will be concatenated. I.e.: --fasta chr_1.fa chr_2.fa In this last case, order is important or the rest of the analysis.''')
    glopts.add_argument('--index', dest='index', metavar="PATH", type=str,
                        help='''paths to file(s) with indexed FASTA files of the reference genome.''')
    glopts.add_argument('--read', dest='read', metavar="INT", type=str,
                        help='read number')
    glopts.add_argument('--renz', dest='renz', metavar="STR", type=str,
                        help='restriction enzyme name')
    glopts.add_argument('--chr_name', dest='chr_name', metavar="STR",
                        nargs='+', default=[], type=str,
                        help='''[fasta header] chromosome name(s). Used in the same order as data.''')
    glopts.add_argument('--tmp', dest='tmp', metavar="PATH", action='store',
                        default=None, type=str,
                        help='''path to a temporary directory (default next to output directory)''')

    mapper.add_argument('--strategy', dest='strategy', default='frag',
                        choices=['frag', 'iter'],
                        help='''mapping strategy, can be "frag" for fragment based mapping or "iter" for iterative mapping''')
    mapper.add_argument('--windows', dest='windows', default='auto',
                        nargs='+',
                        help='''for iterative mapping, defines windows. e.g. --windows 20 25 30 35 40 45 50''')
    mapper.add_argument('--read_length', dest='read_length', type=int,
                        help='''read length, compulsory in iterative mapping with --windows auto''')
    mapper.add_argument('--mapping_only', dest='mapping_only',
                        action='store_true',
                        help='only do the mapping does not parse results')

    descro.add_argument('--species', dest='species', metavar="STR", type=str,
                        help='species name')
    descro.add_argument('--descr', dest='description', metavar="LIST",
                        nargs='+', type=str,
                        help='''extra descriptive fields each filed separated by coma, and inside each, name and value separated by column: --descr=cell:lymphoblast,flowcell:C68AEACXX,index:24nf''')

    parser.add_argument_group(glopts)
    parser.add_argument_group(descro)
    parser.add_argument_group(mapper)
    opts = parser.parse_args()

    if opts.cfg:
        get_options_from_cfg(opts.cfg, opts)

    # BUG FIX: the attribute is "windows" (dest='windows'); the former
    # "opts.window" raised AttributeError whenever strategy was 'iter'
    if (opts.strategy == 'iter' and opts.windows == 'auto'
            and not opts.read_length):
        raise Exception('ERROR: need to input read_length')

    # check RE name
    try:
        _ = RESTRICTION_ENZYMES[opts.renz]
    except KeyError:
        print('\n\nERROR: restriction enzyme not found. Use one of:\n\n' +
              ' '.join(sorted(RESTRICTION_ENZYMES)) + '\n\n')
        raise KeyError()

    # check compulsory options
    if not opts.quality_plot:
        # genome and index only needed beyond the QC plot
        # NOTE(review): nesting of the remaining checks reconstructed
        # from flattened source -- output/fastq/renz kept unconditional
        if not opts.genome:
            raise Exception('ERROR: genome option required.')
        if not opts.index:
            raise Exception('ERROR: index option required.')
    if not opts.output:
        raise Exception('ERROR: output option required.')
    if not opts.fastq:
        raise Exception('ERROR: fastq option required.')
    if not opts.renz:
        raise Exception('ERROR: renz option required.')

    if not opts.tmp:
        opts.tmp = opts.output + '_tmp_r' + opts.read

    if opts.strategy == 'frag':
        opts.windows = None
    if opts.strategy == 'iter':
        raise NotImplementedError()

    # NOTE(review): shell call breaks on paths containing spaces;
    # consider os.makedirs(opts.output, exist_ok=True)
    system('mkdir -p ' + opts.output)

    # write log
    if opts.mapping_only:
        log_format = '[MAPPING {} READ{}]   %(message)s'.format(opts.fastq,
                                                                opts.read)
    else:
        log_format = '[DEFAULT]   %(message)s'
    # reset logging
    logging.getLogger().handlers = []
    try:
        # BUG FIX: py2 print statement, and 'aw' is not a valid file
        # mode ('a' already appends)
        print('Writing log to ' + path.join(opts.output, 'process.log'))
        logging.basicConfig(level=logging.INFO,
                            format=log_format,
                            filename=path.join(opts.output, 'process.log'),
                            filemode='a')
    except IOError:
        logging.basicConfig(level=logging.DEBUG,
                            format=log_format,
                            filename=path.join(opts.output, 'process.log2'),
                            filemode='a')
    # to display log on stdout also
    logging.getLogger().addHandler(logging.StreamHandler())

    # write version log
    vlog_path = path.join(opts.output,
                          'TADbit_and_dependencies_versions.log')
    dependencies = get_dependencies_version()
    # BUG FIX: compare file content (read()) with the dependencies
    # string; readlines() gives a list and the test was always True
    if not path.exists(vlog_path) or open(vlog_path).read() != dependencies:
        logging.info('Writing versions of TADbit and dependencies')
        with open(vlog_path, 'w') as vlog:
            vlog.write(dependencies)

    return opts
def get_options(): """ parse option from call """ parser = ArgumentParser( usage="%(prog)s [options] [--cfg CONFIG_PATH]", formatter_class=lambda prog: HelpFormatter(prog, width=95, max_help_position=27), ) glopts = parser.add_argument_group("General arguments") taddet = parser.add_argument_group("TAD detection arguments") optimo = parser.add_argument_group("Optimization of IMP arguments") modelo = parser.add_argument_group("Modeling with optimal IMP arguments") descro = parser.add_argument_group("Descriptive, optional arguments") analyz = parser.add_argument_group("Output arguments") ## Define analysis actions: actions = { 0: "do nothing", 1: "column filtering", 2: "TAD borders", 3: "TAD alignment", 4: "optimization plot", 5: "correlation real/models", 6: "z-score plot", 7: "constraints", 8: "objective function", 9: "centroid", 10: "consistency", 11: "density", 12: "contact map", 13: "walking angle", 14: "persistence length", 15: "accessibility", 16: "interaction", } parser.add_argument( "--usage", dest="usage", action="store_true", default=False, help="""show detailed usage documentation, with examples and exit""", ) parser.add_argument( "--cfg", dest="cfg", metavar="PATH", action="store", default=None, type=str, help="path to a configuration file with predefined " + "parameters", ) parser.add_argument( "--analyze_only", dest="analyze_only", action="store_true", default=False, help=("load precomputed models in outdir, " + "skip optimization, modeling"), ) parser.add_argument( "--optimize_only", dest="optimize_only", default=False, action="store_true", help="do the optimization of the region and exit", ) parser.add_argument( "--tad_only", dest="tad_only", action="store_true", default=False, help="[%(default)s] exit after searching for TADs", ) parser.add_argument( "--ncpus", dest="ncpus", metavar="INT", default=1, type=int, help="[%(default)s] Number of CPUs to use" ) ######################################### # GENERAL glopts.add_argument( "--root_path", 
dest="root_path", metavar="PATH", default="", type=str, help=("path to search for data files (just pass file name" + 'in "data")'), ) glopts.add_argument( "--data", dest="data", metavar="PATH", nargs="+", type=str, help="""path to file(s) with Hi-C data matrix. If many, experiments will be summed up. I.e.: --data replicate_1.txt replicate_2.txt""", ) glopts.add_argument( "--xname", dest="xname", metavar="STR", nargs="+", default=[], type=str, help="""[file name] experiment name(s). Use same order as data.""", ) glopts.add_argument( "--norm", dest="norm", metavar="PATH", nargs="+", type=str, help="path to file(s) with normalizedHi-C data matrix.", ) glopts.add_argument( "--nodiag", dest="nodiag", action="store_true", help="""If the matrix does not contain self interacting bins (only zeroes in the diagonal)""", ) glopts.add_argument( "--filt", dest="filt", metavar="INT", default=90, help="""Filter out column with more than a given percentage of zeroes""", ) glopts.add_argument("--crm", dest="crm", metavar="NAME", help="chromosome name") glopts.add_argument( "--beg", dest="beg", metavar="INT", type=float, default=None, help="genomic coordinate from which to start modeling", ) glopts.add_argument("--end", dest="end", metavar="INT", type=float, help="genomic coordinate where to end modeling") glopts.add_argument("--res", dest="res", metavar="INT", type=int, help="resolution of the Hi-C experiment") glopts.add_argument("--outdir", dest="outdir", metavar="PATH", default=None, help="out directory for results") ######################################### # TADs taddet.add_argument( "--tad", dest="tad", action="store_true", default=False, help="[%(default)s] search for TADs in experiments" ) taddet.add_argument( "--centromere", dest="centromere", action="store_true", default=False, help="[%(default)s] search for centromeric region", ) taddet.add_argument( "--group", dest="group", nargs="+", type=int, default=0, metavar="INT", help="""[all together] How to group Hi-C 
experiments for the detection of TAD borders. I.e.: "--exp_group 2 2 1" first 2 experiments used together, next 2 also, and last alone (batch_mode option used)""", ) ######################################### # MODELING modelo.add_argument( "--nmodels_mod", dest="nmodels_mod", metavar="INT", default="5000", type=int, help=("[%(default)s] number of models to generate for" + " modeling"), ) modelo.add_argument( "--nkeep_mod", dest="nkeep_mod", metavar="INT", default="1000", type=int, help=("[%(default)s] number of models to keep for " + "modeling"), ) ######################################### # OPTIMIZATION optimo.add_argument( "--maxdist", action="store", metavar="LIST", default="400", dest="maxdist", help="range of numbers for maxdist" + ", i.e. 400:1000:100 -- or just a number", ) optimo.add_argument( "--upfreq", dest="upfreq", metavar="LIST", default="0", help="range of numbers for upfreq" + ", i.e. 0:1.2:0.3 -- or just a number", ) optimo.add_argument( "--lowfreq", dest="lowfreq", metavar="LIST", default="0", help="range of numbers for lowfreq" + ", i.e. -1.2:0:0.3 -- or just a number", ) optimo.add_argument( "--scale", dest="scale", metavar="LIST", default="0.01", help="[%(default)s] range of numbers to be test as " + "optimal scale value, i.e. 0.005:0.01:0.001 -- Can " + "also pass only one number", ) optimo.add_argument( "--dcutoff", dest="dcutoff", metavar="LIST", default="2", help="[%(default)s] range of numbers to be test as " + "optimal distance cutoff parameter (distance, in " + "number of beads, from which to consider 2 beads as " + "being close), i.e. 
1:5:0.5 -- Can also pass only one" + " number", ) optimo.add_argument( "--nmodels_opt", dest="nmodels_opt", metavar="INT", default="500", type=int, help="[%(default)s] number of models to generate for " + "optimization", ) optimo.add_argument( "--nkeep_opt", dest="nkeep_opt", metavar="INT", default="100", type=int, help="[%(default)s] number of models to keep for " + "optimization", ) optimo.add_argument( "--force_opt", dest="optimize_from_scratch", action="store_true", default=False, help="""[%(default)s] do not take into account previous optimizations. Usefull for running in parallel in a cluster for example.""", ) ######################################### # DESCRIPTION descro.add_argument( "--species", dest="species", metavar="STRING", default="UNKNOWN", help="species name, with no spaces, i.e.: homo_sapiens", ) descro.add_argument("--cell", dest="cell", metavar="STRING", help="cell type name") descro.add_argument("--exp_type", dest="exp_type", metavar="STRING", help="experiment type name (i.e.: Hi-C)") descro.add_argument( "--assembly", dest="assembly", metavar="STRING", default=None, help="""NCBI ID of the original assembly (i.e.: NCBI36 for human)""", ) descro.add_argument( "--enzyme", dest="enzyme", metavar="STRING", default=None, help="""name of the enzyme used to digest chromatin (i.e. HindIII)""", ) descro.add_argument( "--identifier", dest="identifier", metavar="STRING", default=None, help="""NCBI identifier of the experiment""" ) descro.add_argument("--project", dest="project", metavar="STRING", default=None, help="""project name""") ######################################### # OUTPUT analyz.add_argument( "--analyze", dest="analyze", nargs="+", choices=range(len(actions)), type=int, default=range(2, len(actions)), metavar="INT", help=( """[%s] list of numbers representing the analysis to be done. 
Choose between: %s""" % ( " ".join([str(i) for i in range(2, len(actions))]), "\n".join(["%s) %s" % (k, actions[k]) for k in actions]), ) ), ) analyz.add_argument( "--not_write_cmm", dest="not_write_cmm", default=False, action="store_true", help="""[%(default)s] do not generate cmm files for each model (Chimera input)""", ) analyz.add_argument( "--not_write_xyz", dest="not_write_xyz", default=False, action="store_true", help="""[%(default)s] do not generate xyz files for each model (3D coordinates)""", ) analyz.add_argument( "--not_write_json", dest="not_write_json", default=False, action="store_true", help="""[%(default)s] do not generate json file.""", ) parser.add_argument_group(optimo) parser.add_argument_group(modelo) parser.add_argument_group(descro) parser.add_argument_group(analyz) opts = parser.parse_args() if opts.usage: print __doc__ exit() log = "\tSummary of arguments:\n" # merger opts with CFG file and write summary args = reduce(lambda x, y: x + y, [i.strip("-").split("=") for i in sys.argv]) new_opts = {} if opts.cfg: for line in open(opts.cfg): if not "=" in line: continue if line.startswith("#"): continue key, value = line.split("#")[0].strip().split("=") key = key.strip() value = value.strip() if value == "True": value = True elif value == "False": value = False elif key in ["data", "norm", "xname", "group", "analyze"]: new_opts.setdefault(key, []).extend(value.split()) continue new_opts[key] = value # bad key in configuration file opts.__dict__["description"] = {} for bad_k in set(new_opts.keys()) - set(opts.__dict__.keys()): sys.stderr.write('WARNING: parameter "%s" not recognized (used as description)\n' % (bad_k)) try: opts.__dict__["description"][bad_k] = int(new_opts[bad_k]) except ValueError: opts.__dict__["description"][bad_k] = new_opts[bad_k] for key in sorted(opts.__dict__.keys()): if key in args: log += " * Command setting %13s to %s\n" % (key, opts.__dict__[key]) elif key in new_opts: opts.__dict__[key] = new_opts[key] log += " - 
Config. setting %13s to %s\n" % (key, new_opts[key]) else: log += " o Default setting %13s to %s\n" % (key, opts.__dict__[key]) # rename analysis actions for i, j in enumerate(opts.analyze): opts.analyze[i] = actions[int(j)] if not opts.data and not opts.norm: sys.stderr.write("MISSING data") exit(parser.print_help()) if not opts.outdir: sys.stderr.write("MISSING outdir") exit(parser.print_help()) if not opts.crm: sys.stderr.write("MISSING crm NAME") exit(parser.print_help()) if not opts.res: sys.stderr.write("MISSING resolution") exit(parser.print_help()) if not opts.analyze_only: if not opts.maxdist: sys.stderr.write("MISSING maxdist") exit(parser.print_help()) if not opts.lowfreq: sys.stderr.write("MISSING lowfreq") exit(parser.print_help()) if not opts.upfreq: sys.stderr.write("MISSING upfreq") exit(parser.print_help()) if not opts.beg and not opts.tad_only: sys.stderr.write("WARNING: no begin coordinate given all") if not opts.end and not opts.tad_only: sys.stderr.write("WARNING: no begin coordinate given all") # groups for TAD detection if not opts.data: opts.data = [None] * len(opts.norm) else: opts.norm = [None] * len(opts.data) if not opts.group: opts.group = [len(opts.data)] else: opts.group = [int(i) for i in opts.group] if sum(opts.group) > len(opts.data): logging.info("ERROR: Number of experiments in groups larger than " + "the number of Hi-C data files given.") exit() # this options should stay as this now # opts.scale = '0.01' # switch to number opts.nmodels_mod = int(opts.nmodels_mod) opts.nkeep_mod = int(opts.nkeep_mod) opts.nmodels_opt = int(opts.nmodels_opt) opts.nkeep_opt = int(opts.nkeep_opt) opts.ncpus = int(opts.ncpus) opts.res = int(opts.res) # TODO: UNDER TEST opts.container = None # ['cylinder', 1000, 5000, 100] # do the division to bins if not opts.tad_only: try: opts.beg = int(float(opts.beg) / opts.res) opts.end = int(float(opts.end) / opts.res) if opts.end - opts.beg <= 2: raise Exception('"beg" and "end" parameter should be given in ' 
+ "genomic coordinates, not bin") except TypeError: pass # Create out-directory name = "{0}_{1}_{2}".format(opts.crm, opts.beg, opts.end) if not os.path.exists(os.path.join(opts.outdir, name)): os.makedirs(os.path.join(opts.outdir, name)) # write version log if not os.path.exists(os.path.join(opts.outdir, "TADbit_and_dependencies_versions.log")): vlog = os.path.join(opts.outdir, "TADbit_and_dependencies_versions.log") vlog = open(vlog, "w") vlog.write(get_dependencies_version()) vlog.close() # write log if opts.optimize_only: log_format = "[OPTIMIZATION {}_{}_{}_{}_{}] %(message)s".format( opts.maxdist, opts.upfreq, opts.lowfreq, opts.scale, opts.dcutoff ) elif opts.analyze_only: log_format = "[ANALYZE] %(message)s" elif opts.tad_only: log_format = "[TAD] %(message)s" else: log_format = "[DEFAULT] %(message)s" try: logging.basicConfig( filename=os.path.join(opts.outdir, name, name + ".log"), level=logging.INFO, format=log_format ) except IOError: logging.basicConfig( filename=os.path.join(opts.outdir, name, name + ".log2"), level=logging.INFO, format=log_format ) logging.getLogger().addHandler(logging.StreamHandler()) logging.info(("\n" + log_format.replace(" %(message)s", "")).join(log.split("\n"))) # update path to Hi-C data adding root directory if opts.root_path and opts.data[0]: for i in xrange(len(opts.data)): logging.info(os.path.join(opts.root_path, opts.data[i])) opts.data[i] = os.path.join(opts.root_path, opts.data[i]) # update path to Hi-C norm adding root directory if opts.root_path and opts.norm[0]: for i in xrange(len(opts.norm)): logging.info(os.path.join(opts.root_path, opts.norm[i])) opts.norm[i] = os.path.join(opts.root_path, opts.norm[i]) return opts
#!/usr/bin/python #================================================================================================== # Created on: 2016-04-05 # Usage: ./print_tadbit_and_dependencies_version.py # Author: Javier Quilez (GitHub: jaquol) # Goal: print the version of TADbit and its dependencies #================================================================================================== # import packages import pytadbit import re # print TADbit and dependencies versions print re.sub(r"\n+", ";", pytadbit.get_dependencies_version()).replace(" ", "")
def get_options(): """ parse option from call """ parser = ArgumentParser( usage="%(prog)s [options] [--cfg CONFIG_PATH]", formatter_class=lambda prog: HelpFormatter(prog, width=95, max_help_position=27)) glopts = parser.add_argument_group('General arguments') taddet = parser.add_argument_group('TAD detection arguments') optimo = parser.add_argument_group('Optimization of IMP arguments') modelo = parser.add_argument_group('Modeling with optimal IMP arguments') descro = parser.add_argument_group('Descriptive, optional arguments') analyz = parser.add_argument_group('Output arguments') ## Define analysis actions: actions = {0 : "do nothing", 1 : "column filtering", 2 : "TAD borders", 3 : "TAD alignment", 4 : "optimization plot", 5 : "correlation real/models", 6 : "z-score plot", 7 : "constraints", 8 : "objective function", 9 : "centroid", 10 : "consistency", 11 : "density", 12 : "contact map", 13 : "walking angle", 14 : "persistence length", 15 : "accessibility", 16 : "interaction"} parser.add_argument('--usage', dest='usage', action="store_true", default=False, help='''show detailed usage documentation, with examples and exit''') parser.add_argument('--cfg', dest='cfg', metavar="PATH", action='store', default=None, type=str, help='path to a configuration file with predefined ' + 'parameters') parser.add_argument('--analyze_only', dest='analyze_only', action='store_true', default=False, help=('load precomputed models in outdir, ' + 'skip optimization, modeling')) parser.add_argument('--optimize_only', dest='optimize_only', default=False, action='store_true', help='do the optimization of the region and exit') parser.add_argument('--tad_only', dest='tad_only', action="store_true", default=False, help='[%(default)s] exit after searching for TADs') parser.add_argument('--ncpus', dest='ncpus', metavar="INT", default=1, type=int, help='[%(default)s] Number of CPUs to use') ######################################### # GENERAL glopts.add_argument( '--root_path', 
dest='root_path', metavar="PATH", default='', type=str, help=('path to search for data files (just pass file name' + 'in "data")')) glopts.add_argument('--data', dest='data', metavar="PATH", nargs='+', type=str, help='''path to file(s) with Hi-C data matrix. If many, experiments will be summed up. I.e.: --data replicate_1.txt replicate_2.txt''') glopts.add_argument('--xname', dest='xname', metavar="STR", nargs='+', default=[], type=str, help='''[file name] experiment name(s). Use same order as data.''') glopts.add_argument('--norm', dest='norm', metavar="PATH", nargs='+', type=str, help='path to file(s) with normalizedHi-C data matrix.') glopts.add_argument('--nodiag', dest='nodiag', action='store_true', help='''If the matrix does not contain self interacting bins (only zeroes in the diagonal)''') glopts.add_argument('--filt', dest='filt', metavar='INT', default=90, help='''Filter out column with more than a given percentage of zeroes''') glopts.add_argument('--crm', dest='crm', metavar="NAME", help='chromosome name') glopts.add_argument('--beg', dest='beg', metavar="INT", type=float, default=None, help='genomic coordinate from which to start modeling') glopts.add_argument('--end', dest='end', metavar="INT", type=float, help='genomic coordinate where to end modeling') glopts.add_argument('--res', dest='res', metavar="INT", type=int, help='resolution of the Hi-C experiment') glopts.add_argument('--outdir', dest='outdir', metavar="PATH", default=None, help='out directory for results') ######################################### # TADs taddet.add_argument('--tad', dest='tad', action="store_true", default=False, help='[%(default)s] search for TADs in experiments') taddet.add_argument('--centromere', dest='centromere', action="store_true", default=False, help='[%(default)s] search for centromeric region') taddet.add_argument('--group', dest='group', nargs='+', type=int, default=0, metavar='INT', help='''[all together] How to group Hi-C experiments for the detection of TAD 
borders. I.e.: "--exp_group 2 2 1" first 2 experiments used together, next 2 also, and last alone (batch_mode option used)''') ######################################### # MODELING modelo.add_argument('--nmodels_mod', dest='nmodels_mod', metavar="INT", default='5000', type=int, help=('[%(default)s] number of models to generate for' + ' modeling')) modelo.add_argument('--nkeep_mod', dest='nkeep_mod', metavar="INT", default='1000', type=int, help=('[%(default)s] number of models to keep for ' + 'modeling')) ######################################### # OPTIMIZATION optimo.add_argument('--maxdist', action='store', metavar="LIST", default='400', dest='maxdist', help='range of numbers for maxdist' + ', i.e. 400:1000:100 -- or just a number') optimo.add_argument('--upfreq', dest='upfreq', metavar="LIST", default='0', help='range of numbers for upfreq' + ', i.e. 0:1.2:0.3 -- or just a number') optimo.add_argument('--lowfreq', dest='lowfreq', metavar="LIST", default='0', help='range of numbers for lowfreq' + ', i.e. -1.2:0:0.3 -- or just a number') optimo.add_argument('--scale', dest='scale', metavar="LIST", default="0.01", help='[%(default)s] range of numbers to be test as ' + 'optimal scale value, i.e. 0.005:0.01:0.001 -- Can ' + 'also pass only one number') optimo.add_argument('--dcutoff', dest='dcutoff', metavar="LIST", default="2", help='[%(default)s] range of numbers to be test as ' + 'optimal distance cutoff parameter (distance, in ' + 'number of beads, from which to consider 2 beads as ' + 'being close), i.e. 
1:5:0.5 -- Can also pass only one' + ' number') optimo.add_argument('--nmodels_opt', dest='nmodels_opt', metavar="INT", default='500', type=int, help='[%(default)s] number of models to generate for ' + 'optimization') optimo.add_argument('--nkeep_opt', dest='nkeep_opt', metavar="INT", default='100', type=int, help='[%(default)s] number of models to keep for ' + 'optimization') optimo.add_argument('--force_opt', dest='optimize_from_scratch', action="store_true", default=False, help='''[%(default)s] do not take into account previous optimizations. Usefull for running in parallel in a cluster for example.''') ######################################### # DESCRIPTION descro.add_argument('--species', dest='species', metavar="STRING", default='UNKNOWN', help='species name, with no spaces, i.e.: homo_sapiens') descro.add_argument('--cell', dest='cell', metavar="STRING", help='cell type name') descro.add_argument('--exp_type', dest='exp_type', metavar="STRING", help='experiment type name (i.e.: Hi-C)') descro.add_argument('--assembly', dest='assembly', metavar="STRING", default=None, help='''NCBI ID of the original assembly (i.e.: NCBI36 for human)''') descro.add_argument('--enzyme', dest='enzyme', metavar="STRING", default=None, help='''name of the enzyme used to digest chromatin (i.e. HindIII)''') descro.add_argument('--identifier', dest='identifier', metavar="STRING", default=None, help='''NCBI identifier of the experiment''') descro.add_argument('--project', dest='project', metavar="STRING", default=None, help='''project name''') ######################################### # OUTPUT analyz.add_argument('--analyze', dest='analyze', nargs='+', choices=range(len(actions)), type=int, default=range(2, len(actions)), metavar='INT', help=('''[%s] list of numbers representing the analysis to be done. 
Choose between: %s''' % (' '.join([str(i) for i in range( 2, len(actions))]), '\n'.join(['%s) %s' % (k, actions[k]) for k in actions])))) analyz.add_argument('--not_write_cmm', dest='not_write_cmm', default=False, action='store_true', help='''[%(default)s] do not generate cmm files for each model (Chimera input)''') analyz.add_argument('--not_write_xyz', dest='not_write_xyz', default=False, action='store_true', help='''[%(default)s] do not generate xyz files for each model (3D coordinates)''') analyz.add_argument('--not_write_json', dest='not_write_json', default=False, action='store_true', help='''[%(default)s] do not generate json file.''') parser.add_argument_group(optimo) parser.add_argument_group(modelo) parser.add_argument_group(descro) parser.add_argument_group(analyz) opts = parser.parse_args() if opts.usage: print __doc__ exit() log = '\tSummary of arguments:\n' # merger opts with CFG file and write summary args = reduce(lambda x, y: x + y, [i.strip('-').split('=') for i in sys.argv]) new_opts = {} if opts.cfg: for line in open(opts.cfg): if not '=' in line: continue if line.startswith('#'): continue key, value = line.split('#')[0].strip().split('=') key = key.strip() value = value.strip() if value == 'True': value = True elif value == 'False': value = False elif key in ['data', 'norm', 'xname', 'group', 'analyze']: new_opts.setdefault(key, []).extend(value.split()) continue new_opts[key] = value # bad key in configuration file opts.__dict__['description'] = {} for bad_k in set(new_opts.keys()) - set(opts.__dict__.keys()): sys.stderr.write('WARNING: parameter "%s" not recognized (used as description)\n' % (bad_k)) try: opts.__dict__['description'][bad_k] = int(new_opts[bad_k]) except ValueError: opts.__dict__['description'][bad_k] = new_opts[bad_k] for key in sorted(opts.__dict__.keys()): if key in args: log += ' * Command setting %13s to %s\n' % ( key, opts.__dict__[key]) elif key in new_opts: opts.__dict__[key] = new_opts[key] log += ' - Config. 
setting %13s to %s\n' % ( key, new_opts[key]) else: log += ' o Default setting %13s to %s\n' % ( key, opts.__dict__[key]) # rename analysis actions for i, j in enumerate(opts.analyze): opts.analyze[i] = actions[int(j)] if not opts.data and not opts.norm: sys.stderr.write('MISSING data') exit(parser.print_help()) if not opts.outdir: sys.stderr.write('MISSING outdir') exit(parser.print_help()) if not opts.crm: sys.stderr.write('MISSING crm NAME') exit(parser.print_help()) if not opts.res: sys.stderr.write('MISSING resolution') exit(parser.print_help()) if not opts.analyze_only: if not opts.maxdist: sys.stderr.write('MISSING maxdist') exit(parser.print_help()) if not opts.lowfreq: sys.stderr.write('MISSING lowfreq') exit(parser.print_help()) if not opts.upfreq: sys.stderr.write('MISSING upfreq') exit(parser.print_help()) if not opts.beg and not opts.tad_only: sys.stderr.write('WARNING: no begin coordinate given all') if not opts.end and not opts.tad_only: sys.stderr.write('WARNING: no begin coordinate given all') # groups for TAD detection if not opts.data: opts.data = [None] * len(opts.norm) else: opts.norm = [None] * len(opts.data) if not opts.group: opts.group = [len(opts.data)] else: opts.group = [int(i) for i in opts.group] if sum(opts.group) > len(opts.data): logging.info('ERROR: Number of experiments in groups larger than ' + 'the number of Hi-C data files given.') exit() # this options should stay as this now # opts.scale = '0.01' # switch to number opts.nmodels_mod = int(opts.nmodels_mod) opts.nkeep_mod = int(opts.nkeep_mod ) opts.nmodels_opt = int(opts.nmodels_opt) opts.nkeep_opt = int(opts.nkeep_opt ) opts.ncpus = int(opts.ncpus ) opts.res = int(opts.res ) # TODO: UNDER TEST opts.container = None #['cylinder', 1000, 5000, 100] # do the division to bins if not opts.tad_only: try: opts.beg = int(float(opts.beg) / opts.res) opts.end = int(float(opts.end) / opts.res) if opts.end - opts.beg <= 2: raise Exception('"beg" and "end" parameter should be given in ' + 
'genomic coordinates, not bin') except TypeError: pass # Create out-directory name = '{0}_{1}_{2}'.format(opts.crm, opts.beg, opts.end) if not os.path.exists(os.path.join(opts.outdir, name)): os.makedirs(os.path.join(opts.outdir, name)) # write version log if not os.path.exists(os.path.join(opts.outdir, 'TADbit_and_dependencies_versions.log')): vlog = os.path.join(opts.outdir, 'TADbit_and_dependencies_versions.log') vlog = open(vlog, 'w') vlog.write(get_dependencies_version()) vlog.close() # write log if opts.optimize_only: log_format = '[OPTIMIZATION {}_{}_{}_{}_{}] %(message)s'.format( opts.maxdist, opts.upfreq, opts.lowfreq, opts.scale, opts.dcutoff) elif opts.analyze_only: log_format = '[ANALYZE] %(message)s' elif opts.tad_only: log_format = '[TAD] %(message)s' else: log_format = '[DEFAULT] %(message)s' try: logging.basicConfig(filename=os.path.join(opts.outdir, name, name + '.log'), level=logging.INFO, format=log_format) except IOError: logging.basicConfig(filename=os.path.join(opts.outdir, name, name + '.log2'), level=logging.INFO, format=log_format) logging.getLogger().addHandler(logging.StreamHandler()) logging.info(('\n' + log_format.replace(' %(message)s', '') ).join(log.split('\n'))) # update path to Hi-C data adding root directory if opts.root_path and opts.data[0]: for i in xrange(len(opts.data)): logging.info(os.path.join(opts.root_path, opts.data[i])) opts.data[i] = os.path.join(opts.root_path, opts.data[i]) # update path to Hi-C norm adding root directory if opts.root_path and opts.norm[0]: for i in xrange(len(opts.norm)): logging.info(os.path.join(opts.root_path, opts.norm[i])) opts.norm[i] = os.path.join(opts.root_path, opts.norm[i]) return opts
def get_options(): """ parse option from call """ parser = ArgumentParser( usage="%(prog)s [options] [--cfg CONFIG_PATH]", formatter_class=lambda prog: HelpFormatter(prog, width=95, max_help_position=27)) glopts = parser.add_argument_group('General options') mapper = parser.add_argument_group('Mapping options') descro = parser.add_argument_group('Descriptive, optional arguments') glopts.add_argument('--cfg', dest='cfg', metavar="PATH", action='store', default=None, type=str, help='path to a configuration file with predefined ' + 'parameters') glopts.add_argument('--qc_plot', dest='quality_plot', action='store_true', default=False, help='generate a quality plot of FASTQ and exits') glopts.add_argument('-o', '--output', dest='output', metavar="PATH", action='store', default=None, type=str, help='path to output folder') glopts.add_argument('--fastq', dest='fastq', metavar="PATH", action='store', default=None, type=str, help='path to a FASTQ files (can be compressed files)') glopts.add_argument('--genome', dest='genome', metavar="PATH", nargs='+', type=str, help='''paths to file(s) with FASTA files of the reference genome. If many, files will be concatenated. I.e.: --fasta chr_1.fa chr_2.fa In this last case, order is important or the rest of the analysis.''') glopts.add_argument('--index', dest='index', metavar="PATH", type=str, help='''paths to file(s) with indexed FASTA files of the reference genome.''') glopts.add_argument('--read', dest='read', metavar="INT", type=str, help='read number') glopts.add_argument('--renz', dest='renz', metavar="STR", type=str, help='restriction enzyme name') glopts.add_argument('--chr_name', dest='chr_name', metavar="STR", nargs='+', default=[], type=str, help='''[fasta header] chromosome name(s). 
Used in the same order as data.''') glopts.add_argument('--tmp', dest='tmp', metavar="PATH", action='store', default=None, type=str, help='''path to a temporary directory (default next to output directory)''') mapper.add_argument('--strategy', dest='strategy', default='frag', choices=['frag', 'iter'], help='''mapping strategy, can be "frag" for fragment based mapping or "iter" for iterative mapping''') mapper.add_argument('--windows', dest='windows', default='auto', nargs='+', help='''for iterative mapping, defines windows. e.g. --windows 20 25 30 35 40 45 50''') mapper.add_argument('--read_length', dest='read_length', type=int, help='''read length, compulsory in iterative mapping with --windows auto''') mapper.add_argument('--mapping_only', dest='mapping_only', action='store_true', help='only do the mapping does not parse results') descro.add_argument('--species', dest='species', metavar="STR", type=str, help='species name') descro.add_argument('--descr', dest='description', metavar="LIST", nargs='+', type=str, help='''extra descriptive fields each filed separated by coma, and inside each, name and value separated by column: --descr=cell:lymphoblast,flowcell:C68AEACXX,index:24nf''') parser.add_argument_group(glopts) parser.add_argument_group(descro) parser.add_argument_group(mapper) opts = parser.parse_args() if opts.cfg: get_options_from_cfg(opts.cfg, opts) if (opts.strategy == 'iter' and opts.window == 'auto' and not opts.read_length): raise Exception('ERROR: need to input read_length') # check RE name try: _ = RESTRICTION_ENZYMES[opts.renz] except KeyError: print ('\n\nERROR: restriction enzyme not found. 
Use one of:\n\n' + ' '.join(sorted(RESTRICTION_ENZYMES)) + '\n\n') raise KeyError() # check compulsory options if not opts.quality_plot: if not opts.genome: raise Exception('ERROR: genome option required.') if not opts.index : raise Exception('ERROR: index option required.') if not opts.output: raise Exception('ERROR: output option required.') if not opts.fastq : raise Exception('ERROR: fastq option required.') if not opts.renz : raise Exception('ERROR: renz option required.') if not opts.tmp: opts.tmp = opts.output + '_tmp_r' + opts.read if opts.strategy == 'frag': opts.windows = None if opts.strategy == 'iter': raise NotImplementedError() system('mkdir -p ' + opts.output) # write log if opts.mapping_only: log_format = '[MAPPING {} READ{}] %(message)s'.format(opts.fastq, opts.read) else: log_format = '[DEFAULT] %(message)s' # reset logging logging.getLogger().handlers = [] try: print 'Writting log to ' + path.join(opts.output, 'process.log') logging.basicConfig(level=logging.INFO, format=log_format, filename=path.join(opts.output, 'process.log'), filemode='aw') except IOError: logging.basicConfig(level=logging.DEBUG, format=log_format, filename=path.join(opts.output, 'process.log2'), filemode='aw') # to display log on stdout also logging.getLogger().addHandler(logging.StreamHandler()) # write version log vlog_path = path.join(opts.output, 'TADbit_and_dependencies_versions.log') dependencies = get_dependencies_version() if not path.exists(vlog_path) or open(vlog_path).readlines() != dependencies: logging.info('Writting versions of TADbit and dependencies') vlog = open(vlog_path, 'w') vlog.write(dependencies) vlog.close() return opts