def submit_function(*args, **kwargs):
    if "submit" in kwargs and kwargs["submit"]:
        del kwargs["submit"]
        # pickle the arguments and resubmit the wrapped function to
        # the cluster, to be unpickled and executed by run_pickled
        submit_args, args_file = _pickle_args(args, kwargs)
        module_file = os.path.abspath(
            sys.modules[func.__module__].__file__)
        submit(snip(__file__),
               "run_pickled",
               params=[snip(module_file), function_name, args_file],
               **submit_args)
    else:
        # remove job control options before running the function
        # locally
        for x in ("submit", "job_options", "job_queue",
                  "job_memory", "job_threads"):
            if x in kwargs:
                del kwargs[x]
        return func(*args, **kwargs)
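
# A minimal usage sketch (assumptions: ``submit_function`` is the
# closure returned by the ``cluster_runnable`` decorator, and
# ``count_lines`` is a hypothetical example function, not part of this
# module). Calling with ``submit=True`` pickles the arguments and
# resubmits the call to the cluster; without it, the job-control
# keywords are stripped and the function runs locally.

@cluster_runnable
def count_lines(infile, outfile):
    # trivial example payload: count the lines in infile
    with open(infile) as inf, open(outfile, "w") as outf:
        outf.write("%i\n" % sum(1 for _ in inf))

# runs locally; job-control keywords are removed before the call:
#   count_lines("input.txt", "counts.txt")
# pickles the arguments and resubmits via submit()/run_pickled:
#   count_lines("input.txt", "counts.txt", submit=True, job_memory="4G")
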
def run_report(clean=True,
               with_pipeline_status=True,
               pipeline_status_format="svg"):
    '''run CGATreport.

    This will also run ruffus to create an svg image of the pipeline
    status unless *with_pipeline_status* is set to False. The image
    will be saved into the export directory.
    '''

    if with_pipeline_status:
        targetdir = PARAMS["exportdir"]
        if not os.path.exists(targetdir):
            os.mkdir(targetdir)

        pipeline_printout_graph(
            os.path.join(
                targetdir,
                "pipeline.%s" % pipeline_status_format),
            pipeline_status_format,
            ["full"],
            checksum_level=PARAMS["ruffus_checksums_level"]
        )

    dirname, basename = os.path.split(getCaller().__file__)

    report_engine = PARAMS.get("report_engine", "cgatreport")
    assert report_engine in ('sphinxreport', 'cgatreport')

    docdir = os.path.join(dirname, "pipeline_docs", snip(basename, ".py"))
    themedir = os.path.join(dirname, "pipeline_docs", "themes")
    relpath = os.path.relpath(docdir)
    trackerdir = os.path.join(docdir, "trackers")

    # warning: memory gets multiplied by threads, so do not set it
    # too high
    job_memory = "1G"
    job_threads = PARAMS["report_threads"]

    # use a fake X display in order to avoid windows popping up
    # from R plots.
    xvfb_command = IOTools.which("xvfb-run")

    # permit multiple servers using the -a option
    if xvfb_command:
        xvfb_command += " -a "
    else:
        xvfb_command = ""

    # if there is no DISPLAY variable set, xvfb runs, but exits with
    # an error when the process is killed. Thus, ignore the return
    # value.
    # print os.getenv("DISPLAY"), "command=", xvfb_command
    if not os.getenv("DISPLAY"):
        erase_return = "|| true"
    else:
        erase_return = ""

    # in the current version, xvfb always returns with an error, thus
    # ignore these.
    erase_return = "|| true"

    if clean:
        clean = """rm -rf report _cache _static;"""
    else:
        clean = ""

    # with sphinx >1.3.1 the PYTHONPATH needs to be set explicitly as
    # the virtual environment seems to be stripped. It is thus set to
    # the contents of the current sys.path
    syspath = ":".join(sys.path)

    statement = '''
    %(clean)s
    (export SPHINX_DOCSDIR=%(docdir)s;
    export SPHINX_THEMEDIR=%(themedir)s;
    export PYTHONPATH=%(syspath)s;
    %(xvfb_command)s
    %(report_engine)s-build
    --num-jobs=%(report_threads)s
    sphinx-build
    -b html
    -d %(report_doctrees)s
    -c .
    -j %(report_threads)s
    %(docdir)s %(report_html)s
    >& report.log %(erase_return)s )
    '''

    run()

    E.info('the report is available at %s' % os.path.abspath(
        os.path.join(PARAMS['report_html'], "contents.html")))
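
# An illustrative sketch of how ``run_report`` is typically wired into
# a pipeline as a pair of ruffus tasks (assumptions: the task names and
# the ``report`` output directory are conventions of the calling
# pipeline, not fixed by this function).

@follows(mkdir("report"))
def build_report():
    '''build report from scratch.'''
    run_report(clean=True)


@follows(mkdir("report"))
def update_report():
    '''update report, re-using cached results.'''
    run_report(clean=False)
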
def mergeAndLoad(infiles,
                 outfile,
                 suffix=None,
                 columns=(0, 1),
                 regex=None,
                 row_wise=True,
                 retry=True,
                 options="",
                 prefixes=None):
    '''merge multiple categorical tables and load into a database.

    The tables are merged and entered row-wise, i.e., the contents of
    each file are a row.

    For example, the statement::

        mergeAndLoad(['file1.txt', 'file2.txt'], "test_table.load")

    with the two files::

        > cat file1.txt
        Category    Result
        length      12
        width       100

        > cat file2.txt
        Category    Result
        length      20
        width       50

    will be added into table ``test_table`` as::

        track   length   width
        file1   12       100
        file2   20       50

    If *row_wise* is set to False::

        mergeAndLoad(['file1.txt', 'file2.txt'], "test_table.load",
                     row_wise=False)

    ``test_table`` will instead contain one column per file::

        track    file1   file2
        length   12      20
        width    100     50

    Arguments
    ---------
    infiles : list
        Filenames of the input data
    outfile : string
        Output filename. This will contain the logging information.
        The table name is derived from `outfile`.
    suffix : string
        If `suffix` is given, the suffix will be removed from the
        filenames.
    columns : list
        The columns to be taken. By default, the first two columns are
        taken with the first being the key. Filenames are stored in a
        ``track`` column. Directory names are chopped off. If
        `columns` is set to None, all columns will be taken. Here,
        column names will receive a prefix given by `prefixes`. If
        `prefixes` is None, the filename will be added as a prefix.
    regex : string
        If set, the full filename will be used to extract a track
        name via the supplied regular expression.
    row_wise : bool
        If set to False, each table will be a column in the resulting
        table. This is useful if histograms are being merged.
    retry : bool
        If True, multiple attempts will be made if the data cannot be
        loaded on the first try, for example if a table is locked.
    options : string
        Command line options for the `csv2db.py` script.
    prefixes : list
        If given, the respective prefix will be added to each column.
        The number of `prefixes` and `infiles` needs to be the same.
    '''
    PARAMS = getParams()

    if len(infiles) == 0:
        raise ValueError("no files for merging")

    if suffix:
        header = ",".join([os.path.basename(snip(x, suffix))
                           for x in infiles])
    elif regex:
        header = ",".join(["-".join(re.search(regex, x).groups())
                           for x in infiles])
    else:
        header = ",".join([os.path.basename(x) for x in infiles])

    header_stmt = "--header-names=%s" % header

    if columns:
        column_filter = "| cut -f %s" % ",".join(
            map(str, [x + 1 for x in columns]))
    else:
        column_filter = ""
        # prefixes only apply when all columns are taken, as
        # documented above
        if prefixes:
            assert len(prefixes) == len(infiles)
            header_stmt = "--prefixes=%s" % ",".join(prefixes)
        else:
            header_stmt = "--add-file-prefix"

    if infiles[0].endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s %s )" % (x, column_filter) for x in infiles])
    else:
        filenames = " ".join(
            ["<( cat %s %s )" % (x, column_filter) for x in infiles])

    if row_wise:
        transform = """| perl -p -e "s/bin/track/"
        | cgat table2table --transpose"""
    else:
        transform = ""

    load_statement = build_load_statement(
        toTable(outfile),
        options="--add-index=track " + options,
        retry=retry)

    statement = """cgat combine_tables
    %(header_stmt)s
    --skip-titles
    --missing-value=0
    --ignore-empty
    %(filenames)s
    %(transform)s
    | %(load_statement)s
    > %(outfile)s
    """

    to_cluster = False

    run()
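
# An illustrative sketch of a loading task built on ``mergeAndLoad``
# (assumptions: the ``.stats`` input files and the target table name
# are hypothetical, not part of this module).

@merge(["sample1.stats", "sample2.stats"], "qc_summary.load")
def loadQCSummary(infiles, outfile):
    '''load per-sample QC statistics into table qc_summary.'''
    mergeAndLoad(infiles, outfile, suffix=".stats")
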